public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH v2 3/3] Consider doloop cmp use in ivopts
@ 2019-05-14  3:10 linkw
  2019-05-14  7:26 ` Richard Biener
  2019-06-19 11:47 ` [PATCH v3 3/3] PR80791 " Kewen.Lin
  0 siblings, 2 replies; 43+ messages in thread
From: linkw @ 2019-05-14  3:10 UTC (permalink / raw)
  To: gcc-patches; +Cc: segher, wschmidt, bin.cheng, rguenther, jakub, Kewen Lin

From: Kewen Lin <linkw@linux.ibm.com>

Previous version link for background:
https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html

First, call the predict_doloop_p hook to check whether this
loop will be transformed into a doloop.  If so, find the
expected compare iv use and the original iv it depends on,
and set that iv candidate as the bind_cand of the group.
In the following candidate selection process, we bypass
the group with bind_cand, since we don't want an iv use
which will eventually be eliminated to affect global
decision making.  At the time of iv candidate set
finalization, we fill in the cost pair for the group
with bind_cand.  If the bind_cand is already in the final
set, we just use it.  Otherwise, we check whether one of
the candidates in the final set is better and use that if so.

Bootstrapped and regression testing passed on powerpc64le.

Is it ok for trunk?

gcc/ChangeLog

2019-05-14  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* tree-ssa-loop-ivopts.c (tree_ssa_iv_optimize_loop): Call
	predict_doloop_p hook and bind_cand_for_doloop_uses.
	(bind_cand_for_doloop_uses): New function.
	(find_optimal_iv_set): Call handle_groups_with_bind_cand.
	(handle_groups_with_bind_cand): New function.
	(record_group): Init bind_cand.
	(set_group_iv_cost): Consider bind_cand group.
	(iv_ca_dump): Add dump for bind_cand.
	(try_add_cand_for): Bypass bind_cand group.
	(iv_ca_extend): Likewise.
	(iv_ca_narrow): Likewise.
	(iv_ca_replace): Likewise.

gcc/testsuite/ChangeLog

2019-05-14  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* gcc.dg/tree-ssa/ivopts-lt.c : Adjust.

---
 gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c |   7 +-
 gcc/tree-ssa-loop-ivopts.c                | 155 +++++++++++++++++++++++++++++-
 2 files changed, 156 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 171c85a..f61288c 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n)
   while (i < n);
 }
 
-/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
-/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
-/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
+/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 885c8e8..50b5900 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -393,6 +393,8 @@ struct iv_group
   struct cost_pair *cost_map;
   /* The selected candidate for the group.  */
   struct iv_cand *selected;
+  /* The bind candidate for this group, for doloop only so far.  */
+  struct iv_cand *bind_cand;
   /* Uses in the group.  */
   vec<struct iv_use *> vuses;
 };
@@ -1592,6 +1594,7 @@ record_group (struct ivopts_data *data, enum use_type type)
   group->type = type;
   group->related_cands = BITMAP_ALLOC (NULL);
   group->vuses.create (1);
+  group->bind_cand = NULL;
 
   data->vgroups.safe_push (group);
   return group;
@@ -3612,7 +3615,9 @@ set_group_iv_cost (struct ivopts_data *data,
 {
   unsigned i, s;
 
-  if (cost.infinite_cost_p ())
+  gcc_assert (cand);
+  /* For the group with bind_cand, make it always have cost pair.  */
+  if (cost.infinite_cost_p () && group->bind_cand != cand)
     {
       BITMAP_FREE (inv_vars);
       BITMAP_FREE (inv_exprs);
@@ -6067,7 +6072,8 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs)
 		 group->id, cp->cand->id, cp->cost.cost,
 		 cp->cost.complexity);
       else
-	fprintf (file, "   group:%d --> ??\n", group->id);
+	fprintf (file, "   group:%d --> ?? %s\n", group->id,
+		 group->bind_cand ? "(bind)" : "");
     }
 
   const char *pref = "";
@@ -6110,6 +6116,9 @@ iv_ca_extend (struct ivopts_data *data, struct iv_ca *ivs,
   for (i = 0; i < ivs->upto; i++)
     {
       group = data->vgroups[i];
+      /* Ignore groups binded with some cand.  */
+      if (group->bind_cand)
+	continue;
       old_cp = iv_ca_cand_for_group (ivs, group);
 
       if (old_cp
@@ -6165,7 +6174,9 @@ iv_ca_narrow (struct ivopts_data *data, struct iv_ca *ivs,
   for (i = 0; i < data->vgroups.length (); i++)
     {
       group = data->vgroups[i];
-
+      /* Ignore groups binded with some cand.  */
+      if (group->bind_cand)
+	continue;
       old_cp = iv_ca_cand_for_group (ivs, group);
       if (old_cp->cand != cand)
 	continue;
@@ -6348,6 +6359,9 @@ iv_ca_replace (struct ivopts_data *data, struct iv_ca *ivs,
       for (j = 0; j < ivs->upto; j++)
 	{
 	  struct iv_group *group = data->vgroups[j];
+	  /* Ignore groups binded with some cand.  */
+	  if (group->bind_cand)
+	    continue;
 	  old_cp = iv_ca_cand_for_group (ivs, group);
 
 	  if (old_cp->cand != cand)
@@ -6406,6 +6420,15 @@ try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs,
   struct iv_ca_delta *best_delta = NULL, *act_delta;
   struct cost_pair *cp;
 
+  /* Bypass the candidate selection for groups with bind_cand, but still
+     keep upto up to date, so that the count of visited groups does not
+     become inconsistent in further handling.  */
+  if (group->bind_cand)
+    {
+      ivs->upto++;
+      return true;
+    }
+
   iv_ca_add_group (data, ivs, group);
   best_cost = iv_ca_cost (ivs);
   cp = iv_ca_cand_for_group (ivs, group);
@@ -6635,6 +6658,59 @@ find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp)
   return set;
 }
 
+/* Since we bypass the candidate determination process for the groups with
+   bind_cand previously, now we want to fill the cost pair for them.  The
+   simplest way is to fill the bind_cand directly, but for some cases it
+   exposes more opportunities for downstream optimization if we rewrite the
+   cmp use with one candidate in the final set.  So the idea is:
+     1) if the bind_cand is already in final set, use bind_cand.
+     2) otherwise, check whether final set has better candidate,
+	fill with it if yes; or still go with bind_cand.  */
+
+static void
+handle_groups_with_bind_cand (struct ivopts_data *data, struct iv_ca *set)
+{
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->bind_cand)
+	{
+	  /* Since we always bypass it before.  */
+	  gcc_assert (!iv_ca_cand_for_group (set, group));
+
+	  struct cost_pair *best_cp
+	    = get_group_iv_cost (data, group, group->bind_cand);
+	  gcc_assert (best_cp);
+
+	  if (!bitmap_bit_p (set->cands, group->bind_cand->id))
+	    {
+	      /* Count in cost of bind_cand.  */
+	      best_cp->cost.cost += best_cp->cand->cost;
+	      unsigned j;
+	      bitmap_iterator bi;
+	      EXECUTE_IF_SET_IN_BITMAP (set->cands, 0, j, bi)
+	      {
+		struct iv_cand *cand = data->vcands[j];
+		/* Since the purpose of the rewrite here is to expose more
+		   opportunities to downstream passes, the cost saving isn't
+		   critical because this cmp use gets eliminated in the
+		   end.  So far we can't see any gain from replacing the
+		   original non memory based iv cand with a memory based
+		   iv cand, and such a rewrite could make doloop unable
+		   to prove that the loop is finite.  */
+		if (group->bind_cand->iv->base_object == NULL_TREE
+		    && cand->iv->base_object != NULL_TREE)
+		  continue;
+		struct cost_pair *cp = get_group_iv_cost (data, group, cand);
+		if (cp && cp->cost < best_cp->cost)
+		  best_cp = cp;
+	      }
+	    }
+	  iv_ca_set_cp (data, set, group, best_cp);
+	}
+    }
+}
+
 static struct iv_ca *
 find_optimal_iv_set (struct ivopts_data *data)
 {
@@ -6672,6 +6748,8 @@ find_optimal_iv_set (struct ivopts_data *data)
   else if (origset)
     iv_ca_free (&origset);
 
+  handle_groups_with_bind_cand (data, set);
+
   for (i = 0; i < data->vgroups.length (); i++)
     {
       struct iv_group *group = data->vgroups[i];
@@ -7442,12 +7520,69 @@ loop_body_includes_call (basic_block *body, unsigned num_nodes)
   return false;
 }
 
+/* Doloop optimization RTL pass can make the related comparison computation
+   become dead and get eliminated, then these comparison IV uses should NOT
+   be considered in optimal IVs determination, set bind_cand for this kind
+   of group, bypass them in later candidate determination algorithm.  */
+
+static void
+bind_cand_for_doloop_uses (struct ivopts_data *data)
+{
+  struct loop *loop = data->current_loop;
+
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->type == USE_COMPARE)
+	{
+	  gcc_assert (group->vuses.length () == 1);
+	  struct iv_use *use = group->vuses[0];
+	  gimple *stmt = use->stmt;
+	  if (gimple_code (stmt) == GIMPLE_COND)
+	    {
+	      basic_block bb = gimple_bb (stmt);
+	      edge true_edge, false_edge;
+	      extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+
+	      /* This comparison is used for loop latch.  Require latch is empty
+		 for now.  */
+	      if ((loop->latch == true_edge->dest
+		   || loop->latch == false_edge->dest)
+		  && empty_block_p (loop->latch))
+		{
+		  for (unsigned j = 0; j < data->vcands.length (); j++)
+		    {
+		      if (bitmap_bit_p (group->related_cands, j))
+			{
+			  struct iv_cand *cand = data->vcands[j];
+			  tree op = use->iv->ssa_name;
+			  if (op == cand->var_before || op == cand->var_after)
+			    {
+			      group->bind_cand = cand;
+			      if (dump_file && (dump_flags & TDF_DETAILS))
+				{
+				  fprintf (dump_file, "Doloop cmp iv use: ");
+				  print_gimple_stmt (dump_file, stmt,
+						     TDF_DETAILS);
+				  dump_cand (dump_file, cand);
+				}
+			      break;
+			    }
+			}
+		    }
+		}
+	    }
+	}
+    }
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
 tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
 {
   bool changed = false;
+  bool bind_for_doloop_p = false;
   struct iv_ca *iv_ca;
   edge exit = single_dom_exit (loop);
   basic_block *body;
@@ -7496,6 +7631,20 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
   /* Finds candidates for the induction variables (item 2).  */
   find_iv_candidates (data);
 
+  bind_for_doloop_p = targetm.predict_doloop_p (loop);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file,
+	       "Predict loop %d can %sperform doloop optimization later.\n",
+	       loop->num, bind_for_doloop_p ? "" : "not ");
+      flow_loop_dump (loop, dump_file, NULL, 1);
+    }
+
+  /* Some compare iv_use is probably useless once the doloop optimization
+     performs.  Set bind_cand for the use (group).  */
+  if (bind_for_doloop_p)
+    bind_cand_for_doloop_uses (data);
+
   /* Calculates the costs (item 3, part 1).  */
   determine_iv_costs (data);
   determine_group_iv_costs (data);
-- 
2.7.4

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 3/3] Consider doloop cmp use in ivopts
  2019-05-14  3:10 [PATCH v2 3/3] Consider doloop cmp use in ivopts linkw
@ 2019-05-14  7:26 ` Richard Biener
  2019-05-15  5:03   ` Kewen.Lin
  2019-06-19 11:47 ` [PATCH v3 3/3] PR80791 " Kewen.Lin
  1 sibling, 1 reply; 43+ messages in thread
From: Richard Biener @ 2019-05-14  7:26 UTC (permalink / raw)
  To: Kewen.Lin
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, bin.cheng,
	Richard Guenther, Jakub Jelinek

On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
>
> From: Kewen Lin <linkw@linux.ibm.com>
>
> Previous version link for background:
> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>
> Firstly, it's to call predict_doloop_p hook to check this
> loop will be transformed to doloop or not, if yes, find
> the expected comp iv use and its dependent original iv,
> set the iv candidate as bind_cand of the group.
> In following candidate selection process, we will bypass
> the group with bind_cand, since we don't want to affect
> global decision making for an iv use which will be
> eliminated eventually.  At the time of iv candidate set
> finalization, we will fill the cost pair for the group
> with bind_cand.  If the bind_cand is already in the final
> set, then just use it. Otherwise, we can check whether one
> of existing final set is better and fill with that if so.
>
> Bootstrapped and regression testing passed on powerpc64le.
>
> Is it ok for trunk?

I wonder what prevents IVOPTs from considering a counter IV
(eventually such a candidate needs to be added if that's not
already done) to be the most profitable variant w/o any
of the other changes?  I guess that would be the costing of
the IV adjust plus branch, which we would need to lower
in case there's nothing inside the loop that would make
the later doloop transform fail?

Richard.

> gcc/ChangeLog
>
> 2019-05-14  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * tree-ssa-loop-ivopts.c (tree_ssa_iv_optimize_loop): Call
>         predict_doloop_p hook and bind_cand_for_doloop_uses.
>         (bind_cand_for_doloop_uses): New function.
>         (find_optimal_iv_set): Call handle_groups_with_bind_cand.
>         (handle_groups_with_bind_cand): New function.
>         (record_group): Init bind_cand.
>         (set_group_iv_cost): Consider bind_cand group.
>         (iv_ca_dump): Add dump for bind_cand.
>         (try_add_cand_for): Bypass bind_cand group.
>         (iv_ca_extend): Likewise.
>         (iv_ca_narrow): Likewise.
>         (iv_ca_replace): Likewise.
>
> gcc/testsuite/ChangeLog
>
> 2019-05-14  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * gcc.dg/tree-ssa/ivopts-lt.c : Adjust.
>
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c |   7 +-
>  gcc/tree-ssa-loop-ivopts.c                | 155 +++++++++++++++++++++++++++++-
>  2 files changed, 156 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
> index 171c85a..f61288c 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
> @@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n)
>    while (i < n);
>  }
>
> -/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
> -/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
> -/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
> +/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
> +/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
> +/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
> +/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
> index 885c8e8..50b5900 100644
> --- a/gcc/tree-ssa-loop-ivopts.c
> +++ b/gcc/tree-ssa-loop-ivopts.c
> @@ -393,6 +393,8 @@ struct iv_group
>    struct cost_pair *cost_map;
>    /* The selected candidate for the group.  */
>    struct iv_cand *selected;
> +  /* The bind candidate for this group, for doloop only so far.  */
> +  struct iv_cand *bind_cand;
>    /* Uses in the group.  */
>    vec<struct iv_use *> vuses;
>  };
> @@ -1592,6 +1594,7 @@ record_group (struct ivopts_data *data, enum use_type type)
>    group->type = type;
>    group->related_cands = BITMAP_ALLOC (NULL);
>    group->vuses.create (1);
> +  group->bind_cand = NULL;
>
>    data->vgroups.safe_push (group);
>    return group;
> @@ -3612,7 +3615,9 @@ set_group_iv_cost (struct ivopts_data *data,
>  {
>    unsigned i, s;
>
> -  if (cost.infinite_cost_p ())
> +  gcc_assert (cand);
> +  /* For the group with bind_cand, make it always have cost pair.  */
> +  if (cost.infinite_cost_p () && group->bind_cand != cand)
>      {
>        BITMAP_FREE (inv_vars);
>        BITMAP_FREE (inv_exprs);
> @@ -6067,7 +6072,8 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs)
>                  group->id, cp->cand->id, cp->cost.cost,
>                  cp->cost.complexity);
>        else
> -       fprintf (file, "   group:%d --> ??\n", group->id);
> +       fprintf (file, "   group:%d --> ?? %s\n", group->id,
> +                group->bind_cand ? "(bind)" : "");
>      }
>
>    const char *pref = "";
> @@ -6110,6 +6116,9 @@ iv_ca_extend (struct ivopts_data *data, struct iv_ca *ivs,
>    for (i = 0; i < ivs->upto; i++)
>      {
>        group = data->vgroups[i];
> +      /* Ignore groups binded with some cand.  */
> +      if (group->bind_cand)
> +       continue;
>        old_cp = iv_ca_cand_for_group (ivs, group);
>
>        if (old_cp
> @@ -6165,7 +6174,9 @@ iv_ca_narrow (struct ivopts_data *data, struct iv_ca *ivs,
>    for (i = 0; i < data->vgroups.length (); i++)
>      {
>        group = data->vgroups[i];
> -
> +      /* Ignore groups binded with some cand.  */
> +      if (group->bind_cand)
> +       continue;
>        old_cp = iv_ca_cand_for_group (ivs, group);
>        if (old_cp->cand != cand)
>         continue;
> @@ -6348,6 +6359,9 @@ iv_ca_replace (struct ivopts_data *data, struct iv_ca *ivs,
>        for (j = 0; j < ivs->upto; j++)
>         {
>           struct iv_group *group = data->vgroups[j];
> +         /* Ignore groups binded with some cand.  */
> +         if (group->bind_cand)
> +           continue;
>           old_cp = iv_ca_cand_for_group (ivs, group);
>
>           if (old_cp->cand != cand)
> @@ -6406,6 +6420,15 @@ try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs,
>    struct iv_ca_delta *best_delta = NULL, *act_delta;
>    struct cost_pair *cp;
>
> +  /* Bypass the candidate selection for the groups with bind_cand, but need
> +     to keep upto up to date, to avoid the count of visited groups becomes
> +     inconsistent in futher handlings.  */
> +  if (group->bind_cand)
> +    {
> +      ivs->upto++;
> +      return true;
> +    }
> +
>    iv_ca_add_group (data, ivs, group);
>    best_cost = iv_ca_cost (ivs);
>    cp = iv_ca_cand_for_group (ivs, group);
> @@ -6635,6 +6658,59 @@ find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp)
>    return set;
>  }
>
> +/* Since we bypass the candidate determination process for the groups with
> +   bind_cand previously, now we want to fill the cost pair for them.  The
> +   simplest way is to fill the bind_cand directly, but for some cases it
> +   exposes more opportunities for downstream optimization if we rewrite the
> +   cmp use with one candidate in the final set.  So the idea is:
> +     1) if the bind_cand is already in final set, use bind_cand.
> +     2) otherwise, check whether final set has better candidate,
> +       fill with it if yes; or still go with bind_cand.  */
> +
> +static void
> +handle_groups_with_bind_cand (struct ivopts_data *data, struct iv_ca *set)
> +{
> +  for (unsigned i = 0; i < data->vgroups.length (); i++)
> +    {
> +      struct iv_group *group = data->vgroups[i];
> +      if (group->bind_cand)
> +       {
> +         /* Since we always bypass it before.  */
> +         gcc_assert (!iv_ca_cand_for_group (set, group));
> +
> +         struct cost_pair *best_cp
> +           = get_group_iv_cost (data, group, group->bind_cand);
> +         gcc_assert (best_cp);
> +
> +         if (!bitmap_bit_p (set->cands, group->bind_cand->id))
> +           {
> +             /* Count in cost of bind_cand.  */
> +             best_cp->cost.cost += best_cp->cand->cost;
> +             unsigned j;
> +             bitmap_iterator bi;
> +             EXECUTE_IF_SET_IN_BITMAP (set->cands, 0, j, bi)
> +             {
> +               struct iv_cand *cand = data->vcands[j];
> +               /* Since the purpose of rewrite here is to expose more
> +                  opportunities to downstream, the cost saving isn't
> +                  critical because this cmp use gets elimintated
> +                  finally.  So far we can't see any gains to replace
> +                  original non memory base iv cand with memory based
> +                  iv cand, but the rewrite could cause doloop unable
> +                  to find it's finite.  */
> +               if (group->bind_cand->iv->base_object == NULL_TREE
> +                   && cand->iv->base_object != NULL_TREE)
> +                 continue;
> +               struct cost_pair *cp = get_group_iv_cost (data, group, cand);
> +               if (cp && cp->cost < best_cp->cost)
> +                 best_cp = cp;
> +             }
> +           }
> +         iv_ca_set_cp (data, set, group, best_cp);
> +       }
> +    }
> +}
> +
>  static struct iv_ca *
>  find_optimal_iv_set (struct ivopts_data *data)
>  {
> @@ -6672,6 +6748,8 @@ find_optimal_iv_set (struct ivopts_data *data)
>    else if (origset)
>      iv_ca_free (&origset);
>
> +  handle_groups_with_bind_cand (data, set);
> +
>    for (i = 0; i < data->vgroups.length (); i++)
>      {
>        struct iv_group *group = data->vgroups[i];
> @@ -7442,12 +7520,69 @@ loop_body_includes_call (basic_block *body, unsigned num_nodes)
>    return false;
>  }
>
> +/* Doloop optimization RTL pass can make the related comparison computation
> +   become dead and get eliminated, then these comparison IV uses should NOT
> +   be considered in optimal IVs determination, set bind_cand for this kind
> +   of group, bypass them in later candidate determination algorithm.  */
> +
> +static void
> +bind_cand_for_doloop_uses (struct ivopts_data *data)
> +{
> +  struct loop *loop = data->current_loop;
> +
> +  for (unsigned i = 0; i < data->vgroups.length (); i++)
> +    {
> +      struct iv_group *group = data->vgroups[i];
> +      if (group->type == USE_COMPARE)
> +       {
> +         gcc_assert (group->vuses.length () == 1);
> +         struct iv_use *use = group->vuses[0];
> +         gimple *stmt = use->stmt;
> +         if (gimple_code (stmt) == GIMPLE_COND)
> +           {
> +             basic_block bb = gimple_bb (stmt);
> +             edge true_edge, false_edge;
> +             extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
> +
> +             /* This comparison is used for loop latch.  Require latch is empty
> +                for now.  */
> +             if ((loop->latch == true_edge->dest
> +                  || loop->latch == false_edge->dest)
> +                 && empty_block_p (loop->latch))
> +               {
> +                 for (unsigned j = 0; j < data->vcands.length (); j++)
> +                   {
> +                     if (bitmap_bit_p (group->related_cands, j))
> +                       {
> +                         struct iv_cand *cand = data->vcands[j];
> +                         tree op = use->iv->ssa_name;
> +                         if (op == cand->var_before || op == cand->var_after)
> +                           {
> +                             group->bind_cand = cand;
> +                             if (dump_file && (dump_flags & TDF_DETAILS))
> +                               {
> +                                 fprintf (dump_file, "Doloop cmp iv use: ");
> +                                 print_gimple_stmt (dump_file, stmt,
> +                                                    TDF_DETAILS);
> +                                 dump_cand (dump_file, cand);
> +                               }
> +                             break;
> +                           }
> +                       }
> +                   }
> +               }
> +           }
> +       }
> +    }
> +}
> +
>  /* Optimizes the LOOP.  Returns true if anything changed.  */
>
>  static bool
>  tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
>  {
>    bool changed = false;
> +  bool bind_for_doloop_p = false;
>    struct iv_ca *iv_ca;
>    edge exit = single_dom_exit (loop);
>    basic_block *body;
> @@ -7496,6 +7631,20 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
>    /* Finds candidates for the induction variables (item 2).  */
>    find_iv_candidates (data);
>
> +  bind_for_doloop_p = targetm.predict_doloop_p (loop);
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file,
> +              "Predict loop %d can %sperform doloop optimization later.\n",
> +              loop->num, bind_for_doloop_p ? "" : "not ");
> +      flow_loop_dump (loop, dump_file, NULL, 1);
> +    }
> +
> +  /* Some compare iv_use is probably useless once the doloop optimization
> +     performs.  Set bind_cand for the use (group).  */
> +  if (bind_for_doloop_p)
> +    bind_cand_for_doloop_uses (data);
> +
>    /* Calculates the costs (item 3, part 1).  */
>    determine_iv_costs (data);
>    determine_group_iv_costs (data);
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 3/3] Consider doloop cmp use in ivopts
  2019-05-14  7:26 ` Richard Biener
@ 2019-05-15  5:03   ` Kewen.Lin
  2019-05-15  8:47     ` Richard Biener
  0 siblings, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-05-15  5:03 UTC (permalink / raw)
  To: Richard Biener
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, bin.cheng,
	Richard Guenther, Jakub Jelinek

on 2019/5/14 下午3:26, Richard Biener wrote:
> On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
>>
>> From: Kewen Lin <linkw@linux.ibm.com>
>>
>> Previous version link for background:
>> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>>
>> Firstly, it's to call predict_doloop_p hook to check this
>> loop will be transformed to doloop or not, if yes, find
>> the expected comp iv use and its dependent original iv,
>> set the iv candidate as bind_cand of the group.
>> In following candidate selection process, we will bypass
>> the group with bind_cand, since we don't want to affect
>> global decision making for an iv use which will be
>> eliminated eventually.  At the time of iv candidate set
>> finalization, we will fill the cost pair for the group
>> with bind_cand.  If the bind_cand is already in the final
>> set, then just use it. Otherwise, we can check whether one
>> of existing final set is better and fill with that if so.
>>
>> Bootstrapped and regression testing passed on powerpc64le.
>>
>> Is it ok for trunk?
> 
> I wonder what prevents IVOPTs to consider a counter IV
> (eventually such candidate needs to be added if that's not
> already done) to be the most profitable variant w/o any
> of the other changes?  I guess that would be costing of
> the IV adjust plus branch which we would need to lower
> in case there's nothing inside the loop that would make
> later doloop transform fail?
> 
> Richard.
> 

If the question is about the behavior without this patch, I
think IVOPTs can find the counter IV as the most profitable
one for the cmp use most of the time.  But the key issue is
that it may stop us from bringing in more iv cands.  We have
to add the iv cost of a new cand that is desirable for some
iv use, and that is probably more than the cost of just using
the counter IV for the use in question.

If the question is about the behavior with this patch, since
we bypass the doloop cmp use in the candidate determination
algorithm, it's possible that some other iv cands are
preferred for the remaining uses rather than the counter IV.
For example, for some address type iv uses, a memory based iv
cand is mostly better.


Thanks,
Kewen

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 3/3] Consider doloop cmp use in ivopts
  2019-05-15  5:03   ` Kewen.Lin
@ 2019-05-15  8:47     ` Richard Biener
  2019-05-15 16:17       ` Segher Boessenkool
                         ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: Richard Biener @ 2019-05-15  8:47 UTC (permalink / raw)
  To: Kewen.Lin
  Cc: Richard Biener, GCC Patches, Segher Boessenkool, Bill Schmidt,
	bin.cheng, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 2982 bytes --]

On Wed, 15 May 2019, Kewen.Lin wrote:

> on 2019/5/14 下午3:26, Richard Biener wrote:
> > On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
> >>
> >> From: Kewen Lin <linkw@linux.ibm.com>
> >>
> >> Previous version link for background:
> >> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
> >>
> >> Firstly, it's to call predict_doloop_p hook to check this
> >> loop will be transformed to doloop or not, if yes, find
> >> the expected comp iv use and its dependent original iv,
> >> set the iv candidate as bind_cand of the group.
> >> In following candidate selection process, we will bypass
> >> the group with bind_cand, since we don't want to affect
> >> global decision making for an iv use which will be
> >> eliminated eventually.  At the time of iv candidate set
> >> finalization, we will fill the cost pair for the group
> >> with bind_cand.  If the bind_cand is already in the final
> >> set, then just use it. Otherwise, we can check whether one
> >> of existing final set is better and fill with that if so.
> >>
> >> Bootstrapped and regression testing passed on powerpc64le.
> >>
> >> Is it ok for trunk?
> > 
> > I wonder what prevents IVOPTs to consider a counter IV
> > (eventually such candidate needs to be added if that's not
> > already done) to be the most profitable variant w/o any
> > of the other changes?  I guess that would be costing of
> > the IV adjust plus branch which we would need to lower
> > in case there's nothing inside the loop that would make
> > later doloop transform fail?
> > 
> > Richard.
> > 
> 
> If the question is for "w/o this patch", I think IVOPTs
> can find counter IV as the most profitable one for the cmp
> use in most time.  But the key issue is that it may stop
> us to bring in more iv cands.  We have to add on iv cost
> of new cand desirable for some iv use, it's probably more
> than the cost of just using counter IV for the interest
> use.  
> 
> If the question is for "w/i this patch", since we bypass
> the doloop cmp use in candidate determination algorithm, 
> it's possible that some other iv cands are preferred for 
> the remaining uses rather than the counter IV. For example,
> for some address type iv use, iv cand with memory based is
> mostly better.

Ah, so the key issue is that the doloop IV is "free"?  That
is, it doesn't consume a general register and whatnot?  That
is, allocating this IV doesn't really interfere with other IVs?
But can other uses be based on the doloop IV easily (if the
IV doesn't reside in a general reg)?

Otherwise I understand that IVOPTs doesn't properly cost
the doloop IV update and conditional branch.  That's clearly
something we should fix (maybe even independently of other
changes).  One important thing is that we need to base costs
on a common base to not compare apples and oranges; I didn't
dig into your patch in detail enough to see whether it
fits into the general cost model or whether it is a hack
on top of everything.

Richard.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 3/3] Consider doloop cmp use in ivopts
  2019-05-15  8:47     ` Richard Biener
@ 2019-05-15 16:17       ` Segher Boessenkool
  2019-05-16  7:25         ` Richard Biener
  2019-05-16  3:53       ` Kewen.Lin
  2019-05-16 18:41       ` Jeff Law
  2 siblings, 1 reply; 43+ messages in thread
From: Segher Boessenkool @ 2019-05-15 16:17 UTC (permalink / raw)
  To: Richard Biener
  Cc: Kewen.Lin, Richard Biener, GCC Patches, Bill Schmidt, bin.cheng,
	Jakub Jelinek

On Wed, May 15, 2019 at 10:47:31AM +0200, Richard Biener wrote:
> Ah, so the key issue is that the doloop IV is "free"?  That
> is, it doesn't consume a general register and whatnot?  That
> is allocating this IV doesn't really interfere with other IVs?

That is one half of it, yes.

> But can other uses be based on the doloop IV easily (if the
> IV doesn't reside in a general reg?)?

Getting the value of the count reg can be expensive, that is the
other half of it.

> Otherwise I understand that IVOPTs doesn't properly cost
> the doloop IV update and conditional branch.

Currently it doesn't even *know* something is or isn't a doloop.
And yeah that matters a lot for proper costing, on all targets that
have a doloop.

> That's clearly
> something we should fix (maybe even indepenently on other
> changes).  One important thing is that we need to base costs
> on a common base to not compare apples and oranges, didn't
> dig into your patch in detail enough to see whether it
> fits into the general cost model or whether it is a hack
> ontop of everything.

The different cost for a doloop is pretty easy...  Might have to
be a target hook though; on Power the decrement + compare-to-zero
are "free", while on some other targets only the "compare" is.
The cost for using the IV...  For us we could just disallow it
being used at all (except for the looping itself of course), but
not sure what is optimal in general.  Another hook?


Segher

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 3/3] Consider doloop cmp use in ivopts
  2019-05-15  8:47     ` Richard Biener
  2019-05-15 16:17       ` Segher Boessenkool
@ 2019-05-16  3:53       ` Kewen.Lin
  2019-05-16 18:41       ` Jeff Law
  2 siblings, 0 replies; 43+ messages in thread
From: Kewen.Lin @ 2019-05-16  3:53 UTC (permalink / raw)
  To: Richard Biener
  Cc: Richard Biener, GCC Patches, Segher Boessenkool, Bill Schmidt,
	bin.cheng, Jakub Jelinek

on 2019/5/15 下午4:47, Richard Biener wrote:
> On Wed, 15 May 2019, Kewen.Lin wrote:
> 
>> on 2019/5/14 下午3:26, Richard Biener wrote:
>>> On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
>>>>
>>>> From: Kewen Lin <linkw@linux.ibm.com>
>>>>
>>>> Previous version link for background:
>>>> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>>>>
>>>> Firstly, it's to call predict_doloop_p hook to check this
>>>> loop will be transformed to doloop or not, if yes, find
>>>> the expected comp iv use and its dependent original iv,
>>>> set the iv candidate as bind_cand of the group.
>>>> In following candidate selection process, we will bypass
>>>> the group with bind_cand, since we don't want to affect
>>>> global decision making for an iv use which will be
>>>> eliminated eventually.  At the time of iv candidate set
>>>> finalization, we will fill the cost pair for the group
>>>> with bind_cand.  If the bind_cand is already in the final
>>>> set, then just use it. Otherwise, we can check whether one
>>>> of existing final set is better and fill with that if so.
>>>>
>>>> Bootstrapped and regression testing passed on powerpc64le.
>>>>
>>>> Is it ok for trunk?
>>>
>>> I wonder what prevents IVOPTs to consider a counter IV
>>> (eventually such candidate needs to be added if that's not
>>> already done) to be the most profitable variant w/o any
>>> of the other changes?  I guess that would be costing of
>>> the IV adjust plus branch which we would need to lower
>>> in case there's nothing inside the loop that would make
>>> later doloop transform fail?
>>>
>>> Richard.
>>>
>>
>> If the question is for "w/o this patch", I think IVOPTs
>> can find counter IV as the most profitable one for the cmp
>> use in most time.  But the key issue is that it may stop
>> us to bring in more iv cands.  We have to add on iv cost
>> of new cand desirable for some iv use, it's probably more
>> than the cost of just using counter IV for the interest
>> use.  
>>
>> If the question is for "w/i this patch", since we bypass
>> the doloop cmp use in candidate determination algorithm, 
>> it's possible that some other iv cands are preferred for 
>> the remaining uses rather than the counter IV. For example,
>> for some address type iv use, iv cand with memory based is
>> mostly better.
> 
> Ah, so the key issue is that the doloop IV is "free"?  That
> is, it doesn't consume a general register and whatnot?  That
> is allocating this IV doesn't really interfere with other IVs?
> But can other uses be based on the doloop IV easily (if the
> IV doesn't reside in a general reg?)?

Yes, it takes one special hardware register, the "counter
register", on Power.  For other uses based on the doloop
IV, if there are no more suitable IVs, we still need
one general reg for update and use.  In the current
patch, although we bypass this doloop cmp use, other
uses are still allowed to choose this doloop IV
candidate.  The costing is as usual.  Since the doloop
cmp use is actually free, we don't want ivopts to
consider it and let it affect the optimal IV set.

> 
> Otherwise I understand that IVOPTs doesn't properly cost
> the doloop IV update and conditional branch.  That's clearly
> something we should fix (maybe even indepenently on other
> changes).  One important thing is that we need to base costs
> on a common base to not compare apples and oranges, didn't
> dig into your patch in detail enough to see whether it
> fits into the general cost model or whether it is a hack
> ontop of everything.
> 

The previous version of the patch made this doloop cmp use
zero cost with any iv cand (as if it were invisible), which
sounds like a better fit for the general cost model?  But it
requires us to preserve the doloop IV in case it's not chosen.
The current version binds the doloop IV in the first place,
bypasses the choosing process and fills it in directly if
nothing better is found later.  For Power, either zero cost or
bypass can coexist with the cost model framework.


Thanks,
Kewen

> Richard.
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 3/3] Consider doloop cmp use in ivopts
  2019-05-15 16:17       ` Segher Boessenkool
@ 2019-05-16  7:25         ` Richard Biener
  2019-05-16 17:35           ` Segher Boessenkool
  0 siblings, 1 reply; 43+ messages in thread
From: Richard Biener @ 2019-05-16  7:25 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Kewen.Lin, Richard Biener, GCC Patches, Bill Schmidt, bin.cheng,
	Jakub Jelinek

On Wed, 15 May 2019, Segher Boessenkool wrote:

> On Wed, May 15, 2019 at 10:47:31AM +0200, Richard Biener wrote:
> > Ah, so the key issue is that the doloop IV is "free"?  That
> > is, it doesn't consume a general register and whatnot?  That
> > is allocating this IV doesn't really interfere with other IVs?
> 
> That is one half of it, yes.
> 
> > But can other uses be based on the doloop IV easily (if the
> > IV doesn't reside in a general reg?)?
> 
> Getting the value of the count reg can be expensive, that is the
> other half of it.
> 
> > Otherwise I understand that IVOPTs doesn't properly cost
> > the doloop IV update and conditional branch.
> 
> Currently it doesn't even *know* something is or isn't a doloop.
> And yeah that matters a lot for proper costing, on all targets that
> have a doloop.

Ah, OK.  So for general handling IVOPTs would add a new
candidate kind (doloop kind) which is costed differently
at the various uses.  The "guessed" RTL we create for
costing also needs to properly create a proper counter reg
(IIRC it always creates pseudos right now, but here it would
need to be a hard reg so costing can properly pessimize uses
in addresses/memory?).

> > That's clearly
> > something we should fix (maybe even indepenently on other
> > changes).  One important thing is that we need to base costs
> > on a common base to not compare apples and oranges, didn't
> > dig into your patch in detail enough to see whether it
> > fits into the general cost model or whether it is a hack
> > ontop of everything.
> 
> The different cost for a doloop is pretty easy...  Might have to
> be a target hook though; on Power the decrement + compare-to-zero
> are "free", while on some other targets only the "compare" is.
> The cost for using the IV...  For us we could just disallow it
> being used at all (except for the looping itself of course), but
> not sure what is optimal in general.  Another hook?

Indeed the easiest thing is to simply disallow uses of the doloop
IV outside of the increment, compare and branch (maybe have a
target hook that says whether a particular IV may be used for
a particular USE).

We'd still need to cost the spilling thing around calls of course,
but this can maybe be done incrementally.  It's still RTL doloop
that ultimately decides on the doloop use.

Richard.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 3/3] Consider doloop cmp use in ivopts
  2019-05-16  7:25         ` Richard Biener
@ 2019-05-16 17:35           ` Segher Boessenkool
  0 siblings, 0 replies; 43+ messages in thread
From: Segher Boessenkool @ 2019-05-16 17:35 UTC (permalink / raw)
  To: Richard Biener
  Cc: Kewen.Lin, Richard Biener, GCC Patches, Bill Schmidt, bin.cheng,
	Jakub Jelinek

On Thu, May 16, 2019 at 09:25:49AM +0200, Richard Biener wrote:
> On Wed, 15 May 2019, Segher Boessenkool wrote:
> > > Otherwise I understand that IVOPTs doesn't properly cost
> > > the doloop IV update and conditional branch.
> > 
> > Currently it doesn't even *know* something is or isn't a doloop.
> > And yeah that matters a lot for proper costing, on all targets that
> > have a doloop.
> 
> Ah, OK.  So for general handling IVOPTs would add a new
> candidate kind (doloop kind) which is costed differently
> at the various uses.

That sounds good.

> The "guessed" RTL we create for
> costing also needs to properly create a proper counter reg
> (IIRC it always creates pseudos right now, but here it would
> need to be a hard reg so costing can properly pessimize uses
> in addresses/memory?).

We always use a pseudo currently; it is not turned into a hard register
until after RA.  Expanding as hard registers directly works really well,
*if* you can put *all* uses of that hard reg into the RTL at expand time
already.  Indirect jumps and switch tables want to use the count
register as well; this complicates things enormously.  Also, sometimes
a loop is mangled enough (in RTL) that it is better to use a GPR as IV.

> > The different cost for a doloop is pretty easy...  Might have to
> > be a target hook though; on Power the decrement + compare-to-zero
> > are "free", while on some other targets only the "compare" is.
> > The cost for using the IV...  For us we could just disallow it
> > being used at all (except for the looping itself of course), but
> > not sure what is optimal in general.  Another hook?
> 
> Indeed the easiest thing is to simply disallow uses of the doloop
> IV outside of the increment, compare and branch (maybe have a
> target hook that says whether a particular IV may be used for
> a particular USE).

But is that generic enough?

> We'd still need to cost the spilling thing around calls of course,
> but this can maybe be done incrementally.  It's still RTL doloop
> that ultimatively decides on the doloop use.

We cannot have doloops with calls, on rs6000.  This differs per target
of course.

We really need to get a good overview of what our various targets need.


Segher

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 3/3] Consider doloop cmp use in ivopts
  2019-05-15  8:47     ` Richard Biener
  2019-05-15 16:17       ` Segher Boessenkool
  2019-05-16  3:53       ` Kewen.Lin
@ 2019-05-16 18:41       ` Jeff Law
  2019-05-16 21:42         ` Segher Boessenkool
  2 siblings, 1 reply; 43+ messages in thread
From: Jeff Law @ 2019-05-16 18:41 UTC (permalink / raw)
  To: Richard Biener, Kewen.Lin
  Cc: Richard Biener, GCC Patches, Segher Boessenkool, Bill Schmidt,
	bin.cheng, Jakub Jelinek

On 5/15/19 2:47 AM, Richard Biener wrote:
> On Wed, 15 May 2019, Kewen.Lin wrote:
> 
>> on 2019/5/14 下午3:26, Richard Biener wrote:
>>> On Tue, May 14, 2019 at 5:10 AM <linkw@linux.ibm.com> wrote:
>>>>
>>>> From: Kewen Lin <linkw@linux.ibm.com>
>>>>
>>>> Previous version link for background:
>>>> https://gcc.gnu.org/ml/gcc-patches/2019-04/msg00912.html
>>>>
>>>> Firstly, it's to call predict_doloop_p hook to check this
>>>> loop will be transformed to doloop or not, if yes, find
>>>> the expected comp iv use and its dependent original iv,
>>>> set the iv candidate as bind_cand of the group.
>>>> In following candidate selection process, we will bypass
>>>> the group with bind_cand, since we don't want to affect
>>>> global decision making for an iv use which will be
>>>> eliminated eventually.  At the time of iv candidate set
>>>> finalization, we will fill the cost pair for the group
>>>> with bind_cand.  If the bind_cand is already in the final
>>>> set, then just use it. Otherwise, we can check whether one
>>>> of existing final set is better and fill with that if so.
>>>>
>>>> Bootstrapped and regression testing passed on powerpc64le.
>>>>
>>>> Is it ok for trunk?
>>>
>>> I wonder what prevents IVOPTs to consider a counter IV
>>> (eventually such candidate needs to be added if that's not
>>> already done) to be the most profitable variant w/o any
>>> of the other changes?  I guess that would be costing of
>>> the IV adjust plus branch which we would need to lower
>>> in case there's nothing inside the loop that would make
>>> later doloop transform fail?
>>>
>>> Richard.
>>>
>>
>> If the question is for "w/o this patch", I think IVOPTs
>> can find counter IV as the most profitable one for the cmp
>> use in most time.  But the key issue is that it may stop
>> us to bring in more iv cands.  We have to add on iv cost
>> of new cand desirable for some iv use, it's probably more
>> than the cost of just using counter IV for the interest
>> use.  
>>
>> If the question is for "w/i this patch", since we bypass
>> the doloop cmp use in candidate determination algorithm, 
>> it's possible that some other iv cands are preferred for 
>> the remaining uses rather than the counter IV. For example,
>> for some address type iv use, iv cand with memory based is
>> mostly better.
> 
> Ah, so the key issue is that the doloop IV is "free"?  That
> is, it doesn't consume a general register and whatnot?  That
> is allocating this IV doesn't really interfere with other IVs?
> But can other uses be based on the doloop IV easily (if the
> IV doesn't reside in a general reg?)?
That's my understanding of how at least some of the low overhead looping
instructions work on some ISAs (ppc included).  There's a special loop
count register and the low overhead looping insns handle the decrement
and branch.

This is similar, but different than something like m68k dbcc where the
counter is a GPR.

For architectures like PPC, we probably don't want to use the loop count
for anything else as it's likely expensive to get data in/out of the
loop count register.

For architectures where the counter is stored in a GPR, then we have
more flexibility in how we use it.

So at least part of the problem is cost modeling of this.  It's all
pretty low level, so not really a good match for the goals of gimple.
But we may ultimately have no choice here but to be pragmatic like we've
done with stuff like vector widths and allow some target properties to
bleed in.

The only saving grace is the existence of low overhead loops is static
-- the target either has them or it doesn't.  Similarly whether or not
the counter is a GPR or not is a static property of the target.

> Otherwise I understand that IVOPTs doesn't properly cost
> the doloop IV update and conditional branch.  That's clearly
> something we should fix (maybe even indepenently on other
> changes). 
It feels independent to me.

> One important thing is that we need to base costs
> on a common base to not compare apples and oranges, didn't
> dig into your patch in detail enough to see whether it
> fits into the general cost model or whether it is a hack
> ontop of everything.
Agreed.

jeff

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v2 3/3] Consider doloop cmp use in ivopts
  2019-05-16 18:41       ` Jeff Law
@ 2019-05-16 21:42         ` Segher Boessenkool
  0 siblings, 0 replies; 43+ messages in thread
From: Segher Boessenkool @ 2019-05-16 21:42 UTC (permalink / raw)
  To: Jeff Law
  Cc: Richard Biener, Kewen.Lin, Richard Biener, GCC Patches,
	Bill Schmidt, bin.cheng, Jakub Jelinek

Hi Jeff,

On Thu, May 16, 2019 at 12:41:16PM -0600, Jeff Law wrote:
> For architectures like PPC, we probably don't want to use the loop count
> for anything else as it's likely expensive to get data in/out of the the
> loop count register.

That is part of it.  Another part is that it costs extra code, negating
one of the advantages of using these instructions.  And a third reason
we do not want this is that on some implementations you have to load the
count register early enough to get the loop predicted correctly.

> So at least part of the problem is cost modeling of this.  It's all
> pretty low level, so not really a good match for the goals of gimple.
> But we may ultimately have no choice here but to be pragmatic like we've
> done with stuff like vector widths and allow some target properties to
> bleed in.

*All* of ivopts is low level in this sense: *all* of it is about finding
out what IVs to use such that it is lowest cost on the target.

Other than costs it doesn't use many target attributes.  For doloop it
would also ask the target whether some loop can be a doloop at all.  So
everything it does is quite high level still, but it *does* have to know
about some very machine-specific things.

Maybe two hooks for that: one, taking a struct loop, to decide if that
loop should be considered for a doloop at all; and another taking a
gimple statement, and returning whether that statement prevents the
loop it is in from being a doloop.  That way we do not have to pass
a lot of gimple data and work to the backends.  Most can just look at
some of the simple loop properties ("is this an inner loop?"), and
allow all statements or just disallow some particular types.

> > Otherwise I understand that IVOPTs doesn't properly cost
> > the doloop IV update and conditional branch.  That's clearly
> > something we should fix (maybe even indepenently on other
> > changes). 
> It feels independent to me.

It cannot cost things properly if nothing has yet decided whether some
loop could (or should) be a doloop :-)


Segher

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-05-14  3:10 [PATCH v2 3/3] Consider doloop cmp use in ivopts linkw
  2019-05-14  7:26 ` Richard Biener
@ 2019-06-19 11:47 ` Kewen.Lin
  2019-06-20  9:09   ` Segher Boessenkool
  2019-07-21  9:06   ` [PATCH v3 " Bin.Cheng
  1 sibling, 2 replies; 43+ messages in thread
From: Kewen.Lin @ 2019-06-19 11:47 UTC (permalink / raw)
  To: gcc-patches
  Cc: segher, wschmidt, bin.cheng, rguenther, jakub, Jeff Law,
	Kugan Vivekanandarajah

[-- Attachment #1: Type: text/plain, Size: 4669 bytes --]

Hi all,

This is the following patch after https://gcc.gnu.org/ml/gcc-patches/2019-06/msg00910.html

Main steps:
  1) Identify the doloop cmp type iv use and record its bind_cand (explain it later).
  2) Set zero cost for pairs between this use and any iv cand.
  3) IV cand set selecting algorithm runs as usual.
  4) Fix up the selected iv cand for doloop use if needed.

It only focuses on targets like Power which have a specific count register.
A new target hook, have_count_reg_decr_p, is proposed for it.

Some notes:

*) Why we need zero cost?  How about just decrease the cost for the pair
   between doloop use and its original iv cand?  How about just decrease
   the cost for the pair between doloop use and one selected iv cand?

   Since some targets support a hardware count register for decrement and
   branch, they don't need the general instruction sequence for decr, cmp and
   branch in general registers.  The cost of moving the count register to a GPR
   is generally high, so it's standalone and can't be shared with other iv
   uses.  It means IVOPTs can treat the doloop use as invisible (zero cost).

   Let's take a look at PR80791 for example.

                            original biv (cand 4)  use derived iv (cand 6)
     generic use:                   4                  0
     comp use (doloop use):         0                 infinite
    
    For iv cost, the original biv has cost 4 while the use derived iv has cost 5.
    When IVOPTs considers the doloop use, the optimal cost is 8 (original biv
    iv cost 4 + use cost 4).  Unfortunately it's not actually optimal: since the
    later doloop transformation updates loop closing by count register, the
    original biv (and its update) won't be needed in loop closing any more.
    The generic use becomes the only use for the original biv.  That means, if we
    know the doloop transformation will be performed later, we shouldn't consider
    the doloop use when determining the IV set.  If we don't consider it, the
    algorithm will choose iv cand 6 with total cost 5 (iv cost 5 + use cost 0).

    From the above, we can see that to decrease the cost for the pair between 
    doloop use and original biv doesn't work.  Meanwhile it's hard to predict
    one good iv cand in final optimal set here and pre-update the cost
    between it and doloop use.  The analysis would be heavy and imperfect.
   
*) Why we need bind_cand?

    As above, we assign zero cost for pairs between the doloop use and each iv
    cand.  It's possible that the doloop use gets assigned an iv cand which is
    invalid to use during the later rewrite.  Then we have to fix it up with the
    iv cand originally used for it.  It's fine whether or not this iv cand exists
    in the final iv cand set; even if it's not in the set, it will be
    eliminated in the doloop transformation.

By the way, I was thinking whether we can replace the hook have_count_reg_decr_p
with flag_branch_on_count_reg.  As the description of the "no-" option, "Disable
the optimization pass that scans for opportunities to use 'decrement and branch' 
instructions on a count register instead of instruction sequences that decrement 
a register, compare it against zero, and then branch based upon the result.", it
implicitly says it has count register support.  But I noticed that the gate of
doloop_optimize checks this flag and, from what I gathered in the previous
discussions, some targets which can perform doloop_optimize don't have a specific
count register, so it sounds like we can't make use of the flag.  Is that correct?

Bootstrapped and regression tested on powerpcle; there is one failure,
which is exposed by this patch and whose root cause is a duplicate of PR62147.
The failing case is gcc.target/powerpc/20050830-1.c

Is it OK for trunk?  

--------------

gcc/ChangeLog

2019-06-19  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* target.def (have_count_reg_decr_p): New hook.
	* doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): New hook.
	* doc/tm.texi: Regenerate.
	* config/rs6000/rs6000.c (rs6000_have_count_reg_decr_p): New function.
	(TARGET_HAVE_COUNT_REG_DECR_P): New macro.
	* tree-ssa-loop-ivopts.c (adjust_group_iv_cost_for_doloop): New function.
	(fixup_doloop_groups): Likewise.
	(find_doloop_use_and_its_bind): Likewise.
	(record_group): Init bind_cand.
	(determine_group_iv_cost): Call adjust_group_iv_cost_for_doloop.
	(find_optimal_iv_set): Call fixup_doloop_groups.
	(tree_ssa_iv_optimize_loop): Call function have_count_reg_decr_p, 
	generic_predict_doloop_p and find_doloop_use_and_its_bind.
	(generic_predict_doloop_p): Update attribute.

gcc/testsuite/ChangeLog

2019-06-19  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* gcc.dg/tree-ssa/ivopts-lt.c: Adjust.



[-- Attachment #2: ivopts.diff --]
[-- Type: text/plain, Size: 10687 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 6667cd0..12f1dfd 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1912,6 +1912,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_PREDICT_DOLOOP_P
 #define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
 
+#undef TARGET_HAVE_COUNT_REG_DECR_P
+#define TARGET_HAVE_COUNT_REG_DECR_P rs6000_have_count_reg_decr_p
+
 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
 
@@ -39437,6 +39440,14 @@ rs6000_predict_doloop_p (struct loop *loop)
   return true;
 }
 
+/* Return true if count register for branch is supported.  */
+
+static bool
+rs6000_have_count_reg_decr_p ()
+{
+  return flag_branch_on_count_reg;
+}
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-rs6000.h"
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index c2aa4d0..46e488f 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11618,6 +11618,12 @@ loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P (void)
+Return true if the target supports hardware count register for decrement
+and branch.
+The default version of this hook returns false.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
 Return true if it is possible to use low-overhead loops (@code{doloop_end}
 and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index b4d57b8..5f43b27 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7946,6 +7946,8 @@ to by @var{ce_info}.
 
 @hook TARGET_PREDICT_DOLOOP_P
 
+@hook TARGET_HAVE_COUNT_REG_DECR_P
+
 @hook TARGET_CAN_USE_DOLOOP_P
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
diff --git a/gcc/target.def b/gcc/target.def
index 71b6972..ec15a6d 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4247,6 +4247,14 @@ The default version of this hook returns false.",
  default_predict_doloop_p)
 
 DEFHOOK
+(have_count_reg_decr_p,
+ "Return true if the target supports hardware count register for decrement\n\
+and branch.\n\
+The default version of this hook returns false.",
+ bool, (void),
+ hook_bool_void_false)
+
+DEFHOOK
 (can_use_doloop_p,
  "Return true if it is possible to use low-overhead loops (@code{doloop_end}\n\
 and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the\n\
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 7d5859b..71d7f67 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n)
   while (i < n);
 }
 
-/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
-/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
-/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
+/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 530ea4a..9a14ba8 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -399,6 +399,8 @@ struct iv_group
   struct cost_pair *cost_map;
   /* The selected candidate for the group.  */
   struct iv_cand *selected;
+  /* The bind candidate for this group, for doloop use group only.  */
+  struct iv_cand *bind_cand;
   /* Uses in the group.  */
   vec<struct iv_use *> vuses;
 };
@@ -612,6 +614,9 @@ struct ivopts_data
 
   /* Whether the loop body can only be exited via single exit.  */
   bool loop_single_exit_p;
+
+  /* Whether the loop has doloop comparison use.  */
+  bool doloop_use_p;
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -1528,6 +1533,7 @@ record_group (struct ivopts_data *data, enum use_type type)
   group->type = type;
   group->related_cands = BITMAP_ALLOC (NULL);
   group->vuses.create (1);
+  group->bind_cand = NULL;
 
   data->vgroups.safe_push (group);
   return group;
@@ -3724,7 +3730,7 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
    Some RTL specific checks seems unable to be checked in gimple, if any new
    checks or easy checks _are_ missing here, please add them.  */
 
-static bool ATTRIBUTE_UNUSED
+static bool
 generic_predict_doloop_p (struct ivopts_data *data)
 {
   struct loop *loop = data->current_loop;
@@ -5291,6 +5297,21 @@ determine_group_iv_cost_cond (struct ivopts_data *data,
   return !cost.infinite_cost_p ();
 }
 
+/* Set no cost for pair between doloop iv use GROUP and iv cand CAND.  */
+
+static bool
+adjust_group_iv_cost_for_doloop (struct ivopts_data *data,
+				 struct iv_group *group, struct iv_cand *cand)
+{
+  struct cost_pair *cp = get_group_iv_cost (data, group, cand);
+  if (!cp)
+    set_group_iv_cost (data, group, cand, no_cost, NULL, NULL_TREE, ERROR_MARK,
+		       NULL);
+  else
+    cp->cost = no_cost;
+  return true;
+}
+
 /* Determines cost of computing uses in GROUP with CAND.  Returns false
    if USE cannot be represented with CAND.  */
 
@@ -5308,7 +5329,12 @@ determine_group_iv_cost (struct ivopts_data *data,
       return determine_group_iv_cost_address (data, group, cand);
 
     case USE_COMPARE:
-      return determine_group_iv_cost_cond (data, group, cand);
+      {
+	bool finite_cost_p = determine_group_iv_cost_cond (data, group, cand);
+	if (data->doloop_use_p && group->bind_cand)
+	  finite_cost_p = adjust_group_iv_cost_for_doloop (data, group, cand);
+	return finite_cost_p;
+      }
 
     default:
       gcc_unreachable ();
@@ -6723,6 +6749,29 @@ find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp)
   return set;
 }
 
+/* For doloop use, if the algorithm selects some candidate which is invalid
+   for later rewrite, fix it up with bind_cand.  */
+
+static void
+fixup_doloop_groups (struct ivopts_data *data, struct iv_ca *set)
+{
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->bind_cand)
+	{
+	  struct cost_pair *cp = iv_ca_cand_for_group (set, group);
+	  gcc_assert (cp);
+	  if (cp->cand != group->bind_cand && cp->value == NULL_TREE)
+	    {
+	      struct cost_pair *bind_cp
+		= get_group_iv_cost (data, group, group->bind_cand);
+	      iv_ca_set_cp (data, set, group, bind_cp);
+	    }
+	}
+    }
+}
+
 static struct iv_ca *
 find_optimal_iv_set (struct ivopts_data *data)
 {
@@ -6760,6 +6809,9 @@ find_optimal_iv_set (struct ivopts_data *data)
   else if (origset)
     iv_ca_free (&origset);
 
+  if (data->doloop_use_p)
+    fixup_doloop_groups (data, set);
+
   for (i = 0; i < data->vgroups.length (); i++)
     {
       struct iv_group *group = data->vgroups[i];
@@ -7568,6 +7620,72 @@ determine_scaling_factor (struct ivopts_data *data, basic_block *body)
     }
 }
 
+/* Find doloop comparison use and set its related bind_cand.  We adjust the
+   doloop use group cost against various IV cands, it's possible to assign
+   some cost like zero rather than original inifite cost.  The point is to
+   give more chances to consider other IV cands instead of BIV.  The cost
+   originally given on doloop use can affect optimal decision because it can
+   become dead and get eliminated but considered too much here.
+
+   So it's possible that doloop use is assigned one invalid IV cand to rewrite.
+   In this case, we need bind_cand to fix up.  Even if the bind_cand doesn't
+   exist in final iv_ca set, it won't affect optimal decision since it gets
+   eliminated along with doloop use.  */
+
+static bool
+find_doloop_use_and_its_bind (struct ivopts_data *data)
+{
+  struct loop *loop = data->current_loop;
+
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->type == USE_COMPARE)
+	{
+	  gcc_assert (group->vuses.length () == 1);
+	  struct iv_use *use = group->vuses[0];
+	  gimple *stmt = use->stmt;
+	  if (gimple_code (stmt) == GIMPLE_COND)
+	    {
+	      basic_block bb = gimple_bb (stmt);
+	      edge true_edge, false_edge;
+	      extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+	      /* This comparison is used for loop latch.  Require latch is empty
+		 for now.  */
+	      if ((loop->latch == true_edge->dest
+		   || loop->latch == false_edge->dest)
+		  && empty_block_p (loop->latch))
+		{
+		  for (unsigned j = 0; j < data->vcands.length (); j++)
+		    {
+		      if (bitmap_bit_p (group->related_cands, j))
+			{
+			  struct iv_cand *cand = data->vcands[j];
+			  tree op = use->iv->ssa_name;
+			  if (op == cand->var_before || op == cand->var_after)
+			    {
+			      group->bind_cand = cand;
+			      if (dump_file && (dump_flags & TDF_DETAILS))
+				{
+				  fprintf (dump_file, "Doloop cmp iv use: ");
+				  print_gimple_stmt (dump_file, stmt,
+						     TDF_DETAILS);
+				  dump_cand (dump_file, cand);
+				}
+			      break;
+			    }
+			}
+		    }
+		  if (group->bind_cand)
+		    return true;
+		}
+	    }
+	}
+    }
+
+  return false;
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
@@ -7580,6 +7698,7 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   basic_block *body;
 
   gcc_assert (!data->niters);
+  data->doloop_use_p = false;
   data->current_loop = loop;
   data->loop_loc = find_loop_location (loop).get_location_t ();
   data->speed = optimize_loop_for_speed_p (loop);
@@ -7625,6 +7744,18 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   /* Finds candidates for the induction variables (item 2).  */
   find_iv_candidates (data);
 
+  if (targetm.have_count_reg_decr_p () && generic_predict_doloop_p (data))
+    {
+      data->doloop_use_p = find_doloop_use_and_its_bind (data);
+      if (data->doloop_use_p && dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file,
+		   "Predict loop %d can perform doloop optimization later.\n",
+		   loop->num);
+	  flow_loop_dump (loop, dump_file, NULL, 1);
+	}
+    }
+
   /* Calculates the costs (item 3, part 1).  */
   determine_iv_costs (data);
   determine_group_iv_costs (data);

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-06-19 11:47 ` [PATCH v3 3/3] PR80791 " Kewen.Lin
@ 2019-06-20  9:09   ` Segher Boessenkool
  2019-06-20 12:08     ` Kewen.Lin
  2019-07-21  9:06   ` [PATCH v3 " Bin.Cheng
  1 sibling, 1 reply; 43+ messages in thread
From: Segher Boessenkool @ 2019-06-20  9:09 UTC (permalink / raw)
  To: Kewen.Lin
  Cc: gcc-patches, wschmidt, bin.cheng, rguenther, jakub, Jeff Law,
	Kugan Vivekanandarajah

Hi Kewen,

On Wed, Jun 19, 2019 at 07:47:34PM +0800, Kewen.Lin wrote:
> +/* Return true if count register for branch is supported.  */
> +
> +static bool
> +rs6000_have_count_reg_decr_p ()
> +{
> +  return flag_branch_on_count_reg;
> +}

rs6000 unconditionally supports these instructions, not just when that
flag is set.  If you need to look at the flag, the *caller* of this new
hook should, not every implementation of the hook.  So just "return true"
here?

>  DEFHOOK
> +(have_count_reg_decr_p,
> + "Return true if the target supports hardware count register for decrement\n\
> +and branch.\n\
> +The default version of this hook returns false.",
> + bool, (void),
> + hook_bool_void_false)

Is it important here that you cannot use that register as a GPR, that any
use of it is expensive because it has to be moved to/from a GPR?  The doc
should say something like that; a little more context, what the hook is
meant to be used for.

> +/* For doloop use, if the algothrim selects some candidate which invalid for
> +   later rewrite, fix it up with bind_cand.  */

"algorithm", "which is invalid".

> +/* Find doloop comparison use and set its related bind_cand.  We adjust the
> +   doloop use group cost against various IV cands, it's possible to assign
> +   some cost like zero rather than original inifite cost.  The point is to

"infinite"

Looks good :-)


Segher

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-06-20  9:09   ` Segher Boessenkool
@ 2019-06-20 12:08     ` Kewen.Lin
  2019-06-20 12:17       ` Kewen.Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-06-20 12:08 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: gcc-patches, wschmidt, bin.cheng, rguenther, jakub, Jeff Law,
	Kugan Vivekanandarajah

[-- Attachment #1: Type: text/plain, Size: 866 bytes --]

Hi Segher,

> On Wed, Jun 19, 2019 at 07:47:34PM +0800, Kewen.Lin wrote:
>> +/* Return true if count register for branch is supported.  */
>> +
>> +static bool
>> +rs6000_have_count_reg_decr_p ()
>> +{
>> +  return flag_branch_on_count_reg;
>> +}
> 
> rs6000 unconditionally supports these instructions, not just when that
> flag is set.  If you need to look at the flag, the *caller* of this new
> hook should, not every implementation of the hook.  So just "return true"
> here?

Good point!  Updated it as hookpod.

>> +/* For doloop use, if the algothrim selects some candidate which invalid for
> 
> "algorithm", "which is invalid".

>> +   some cost like zero rather than original inifite cost.  The point is to
> 
> "infinite"
> 

Thanks for catching!  I should run spelling check next time.  :)

New version attached with comments addressed.


Thanks,
Kewen

[-- Attachment #2: ivopts.diff --]
[-- Type: text/plain, Size: 5633 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 12f1dfd..e98aba9 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1913,7 +1913,7 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
 
 #undef TARGET_HAVE_COUNT_REG_DECR_P
-#define TARGET_HAVE_COUNT_REG_DECR_P rs6000_have_count_reg_decr_p
+#define TARGET_HAVE_COUNT_REG_DECR_P true
 
 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
@@ -39440,14 +39440,6 @@ rs6000_predict_doloop_p (struct loop *loop)
   return true;
 }
 
-/* Return true if count register for branch is supported.  */
-
-static bool
-rs6000_have_count_reg_decr_p ()
-{
-  return flag_branch_on_count_reg;
-}
-
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-rs6000.h"
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 46e488f..5477294 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11618,11 +11618,13 @@ loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
-@deftypefn {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P (void)
+@deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
 Return true if the target supports hardware count register for decrement
-and branch.
-The default version of this hook returns false.
-@end deftypefn
+and branch.  This count register can't be used as general register since
+moving to/from a general register from/to it is very expensive.
+For the targets with this support, ivopts can take doloop use as zero cost.
+The default value is false.
+@end deftypevr
 
 @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
 Return true if it is possible to use low-overhead loops (@code{doloop_end}
diff --git a/gcc/target.def b/gcc/target.def
index ec15a6d..8a64e5b 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4246,13 +4246,15 @@ The default version of this hook returns false.",
  bool, (struct loop *loop),
  default_predict_doloop_p)
 
-DEFHOOK
+DEFHOOKPOD
 (have_count_reg_decr_p,
  "Return true if the target supports hardware count register for decrement\n\
-and branch.\n\
-The default version of this hook returns false.",
- bool, (void),
- hook_bool_void_false)
+and branch.  This count register can't be used as general register since\n\
+moving to/from a general register from/to it is very expensive.\n\
+For the targets with this support, ivopts can take doloop use as zero cost.\n\
+The default value is false.",
+ bool, false)
+
 
 DEFHOOK
 (can_use_doloop_p,
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 7d5859b..71d7f67 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n)
   while (i < n);
 }
 
-/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
-/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
-/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
+/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index b1138ea..742d3fa 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -3730,7 +3730,7 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
    Some RTL specific checks seems unable to be checked in gimple, if any new
    checks or easy checks _are_ missing here, please add them.  */
 
-static bool ATTRIBUTE_UNUSED
+static bool
 generic_predict_doloop_p (struct ivopts_data *data)
 {
   struct loop *loop = data->current_loop;
@@ -6749,7 +6749,7 @@ find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp)
   return set;
 }
 
-/* For doloop use, if the algothrim selects some candidate which invalid for
+/* For doloop use, if the algorithm selects some candidate which is invalid for
    later rewrite, fix it up with bind_cand.  */
 
 static void
@@ -7622,7 +7622,7 @@ determine_scaling_factor (struct ivopts_data *data, basic_block *body)
 
 /* Find doloop comparison use and set its related bind_cand.  We adjust the
    doloop use group cost against various IV cands, it's possible to assign
-   some cost like zero rather than original inifite cost.  The point is to
+   some cost like zero rather than original infinite cost.  The point is to
    give more chances to consider other IV cands instead of BIV.  The cost
    originally given on doloop use can affect optimal decision because it can
    become dead and get eliminated but considered too much here.
@@ -7744,7 +7744,8 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   /* Finds candidates for the induction variables (item 2).  */
   find_iv_candidates (data);
 
-  if (targetm.have_count_reg_decr_p() && generic_predict_doloop_p (data))
+  if (flag_branch_on_count_reg && targetm.have_count_reg_decr_p
+      && generic_predict_doloop_p (data))
     {
       data->doloop_use_p = find_doloop_use_and_its_bind (data);
       if (data->doloop_use_p && dump_file && (dump_flags & TDF_DETAILS))

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-06-20 12:08     ` Kewen.Lin
@ 2019-06-20 12:17       ` Kewen.Lin
  2019-07-10  2:31         ` [PING^1][PATCH v4 " Kewen.Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-06-20 12:17 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: gcc-patches, wschmidt, bin.cheng, rguenther, jakub, Jeff Law,
	Kugan Vivekanandarajah

[-- Attachment #1: Type: text/plain, Size: 1048 bytes --]

Hi,

Sorry, the previous patch is incomplete.
New one attached.  Sorry for inconvenience.

on 2019/6/20 下午8:08, Kewen.Lin wrote:
> Hi Segher,
> 
>> On Wed, Jun 19, 2019 at 07:47:34PM +0800, Kewen.Lin wrote:
>>> +/* Return true if count register for branch is supported.  */
>>> +
>>> +static bool
>>> +rs6000_have_count_reg_decr_p ()
>>> +{
>>> +  return flag_branch_on_count_reg;
>>> +}
>>
>> rs6000 unconditionally supports these instructions, not just when that
>> flag is set.  If you need to look at the flag, the *caller* of this new
>> hook should, not every implementation of the hook.  So just "return true"
>> here?
> 
> Good point!  Updated it as hookpod.
> 
>>> +/* For doloop use, if the algothrim selects some candidate which invalid for
>>
>> "algorithm", "which is invalid".
> 
>>> +   some cost like zero rather than original inifite cost.  The point is to
>>
>> "infinite"
>>
> 
> Thanks for catching!  I should run spelling check next time.  :)
> 
> New version attached with comments addressed.
> 
> 
> Thanks,
> Kewen
> 

[-- Attachment #2: ivopts.diff --]
[-- Type: text/plain, Size: 10673 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 6667cd0..e98aba9 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1912,6 +1912,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_PREDICT_DOLOOP_P
 #define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
 
+#undef TARGET_HAVE_COUNT_REG_DECR_P
+#define TARGET_HAVE_COUNT_REG_DECR_P true
+
 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index c2aa4d0..5477294 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11618,6 +11618,14 @@ loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
+@deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
+Return true if the target supports hardware count register for decrement
+and branch.  This count register can't be used as general register since
+moving to/from a general register from/to it is very expensive.
+For the targets with this support, ivopts can take doloop use as zero cost.
+The default value is false.
+@end deftypevr
+
 @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
 Return true if it is possible to use low-overhead loops (@code{doloop_end}
 and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index b4d57b8..5f43b27 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7946,6 +7946,8 @@ to by @var{ce_info}.
 
 @hook TARGET_PREDICT_DOLOOP_P
 
+@hook TARGET_HAVE_COUNT_REG_DECR_P
+
 @hook TARGET_CAN_USE_DOLOOP_P
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
diff --git a/gcc/target.def b/gcc/target.def
index 71b6972..8a64e5b 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4246,6 +4246,16 @@ The default version of this hook returns false.",
  bool, (struct loop *loop),
  default_predict_doloop_p)
 
+DEFHOOKPOD
+(have_count_reg_decr_p,
+ "Return true if the target supports hardware count register for decrement\n\
+and branch.  This count register can't be used as general register since\n\
+moving to/from a general register from/to it is very expensive.\n\
+For the targets with this support, ivopts can take doloop use as zero cost.\n\
+The default value is false.",
+ bool, false)
+
+
 DEFHOOK
 (can_use_doloop_p,
  "Return true if it is possible to use low-overhead loops (@code{doloop_end}\n\
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 7d5859b..71d7f67 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n)
   while (i < n);
 }
 
-/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
-/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
-/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
+/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 530ea4a..742d3fa 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -399,6 +399,8 @@ struct iv_group
   struct cost_pair *cost_map;
   /* The selected candidate for the group.  */
   struct iv_cand *selected;
+  /* The bind candidate for this group, for doloop use group only.  */
+  struct iv_cand *bind_cand;
   /* Uses in the group.  */
   vec<struct iv_use *> vuses;
 };
@@ -612,6 +614,9 @@ struct ivopts_data
 
   /* Whether the loop body can only be exited via single exit.  */
   bool loop_single_exit_p;
+
+  /* Whether the loop has doloop comparison use.  */
+  bool doloop_use_p;
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -1528,6 +1533,7 @@ record_group (struct ivopts_data *data, enum use_type type)
   group->type = type;
   group->related_cands = BITMAP_ALLOC (NULL);
   group->vuses.create (1);
+  group->bind_cand = NULL;
 
   data->vgroups.safe_push (group);
   return group;
@@ -3724,7 +3730,7 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
    Some RTL specific checks seems unable to be checked in gimple, if any new
    checks or easy checks _are_ missing here, please add them.  */
 
-static bool ATTRIBUTE_UNUSED
+static bool
 generic_predict_doloop_p (struct ivopts_data *data)
 {
   struct loop *loop = data->current_loop;
@@ -5291,6 +5297,21 @@ determine_group_iv_cost_cond (struct ivopts_data *data,
   return !cost.infinite_cost_p ();
 }
 
+/* Set no cost for pair between doloop iv use GROUP and iv cand CAND.  */
+
+static bool
+adjust_group_iv_cost_for_doloop (struct ivopts_data *data,
+				 struct iv_group *group, struct iv_cand *cand)
+{
+  struct cost_pair *cp = get_group_iv_cost (data, group, cand);
+  if (!cp)
+    set_group_iv_cost (data, group, cand, no_cost, NULL, NULL_TREE, ERROR_MARK,
+		       NULL);
+  else
+    cp->cost = no_cost;
+  return true;
+}
+
 /* Determines cost of computing uses in GROUP with CAND.  Returns false
    if USE cannot be represented with CAND.  */
 
@@ -5308,7 +5329,12 @@ determine_group_iv_cost (struct ivopts_data *data,
       return determine_group_iv_cost_address (data, group, cand);
 
     case USE_COMPARE:
-      return determine_group_iv_cost_cond (data, group, cand);
+      {
+	bool finite_cost_p = determine_group_iv_cost_cond (data, group, cand);
+	if (data->doloop_use_p && group->bind_cand)
+	  finite_cost_p = adjust_group_iv_cost_for_doloop (data, group, cand);
+	return finite_cost_p;
+      }
 
     default:
       gcc_unreachable ();
@@ -6723,6 +6749,29 @@ find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp)
   return set;
 }
 
+/* For doloop use, if the algorithm selects some candidate which is invalid for
+   later rewrite, fix it up with bind_cand.  */
+
+static void
+fixup_doloop_groups (struct ivopts_data *data, struct iv_ca *set)
+{
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->bind_cand)
+	{
+	  struct cost_pair *cp = iv_ca_cand_for_group (set, group);
+	  gcc_assert (cp);
+	  if (cp->cand != group->bind_cand && cp->value == NULL_TREE)
+	    {
+	      struct cost_pair *bind_cp
+		= get_group_iv_cost (data, group, group->bind_cand);
+	      iv_ca_set_cp (data, set, group, bind_cp);
+	    }
+	}
+    }
+}
+
 static struct iv_ca *
 find_optimal_iv_set (struct ivopts_data *data)
 {
@@ -6760,6 +6809,9 @@ find_optimal_iv_set (struct ivopts_data *data)
   else if (origset)
     iv_ca_free (&origset);
 
+  if (data->doloop_use_p)
+    fixup_doloop_groups (data, set);
+
   for (i = 0; i < data->vgroups.length (); i++)
     {
       struct iv_group *group = data->vgroups[i];
@@ -7568,6 +7620,72 @@ determine_scaling_factor (struct ivopts_data *data, basic_block *body)
     }
 }
 
+/* Find doloop comparison use and set its related bind_cand.  We adjust the
+   doloop use group cost against various IV cands, it's possible to assign
+   some cost like zero rather than original infinite cost.  The point is to
+   give more chances to consider other IV cands instead of BIV.  The cost
+   originally given on doloop use can affect optimal decision because it can
+   become dead and get eliminated but considered too much here.
+
+   So it's possible that doloop use is assigned one invalid IV cand to rewrite.
+   In this case, we need bind_cand to fix up.  Even if the bind_cand doesn't
+   exist in final iv_ca set, it won't affect optimal decision since it gets
+   eliminated along with doloop use.  */
+
+static bool
+find_doloop_use_and_its_bind (struct ivopts_data *data)
+{
+  struct loop *loop = data->current_loop;
+
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->type == USE_COMPARE)
+	{
+	  gcc_assert (group->vuses.length () == 1);
+	  struct iv_use *use = group->vuses[0];
+	  gimple *stmt = use->stmt;
+	  if (gimple_code (stmt) == GIMPLE_COND)
+	    {
+	      basic_block bb = gimple_bb (stmt);
+	      edge true_edge, false_edge;
+	      extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+	      /* This comparison is used for loop latch.  Require latch is empty
+		 for now.  */
+	      if ((loop->latch == true_edge->dest
+		   || loop->latch == false_edge->dest)
+		  && empty_block_p (loop->latch))
+		{
+		  for (unsigned j = 0; j < data->vcands.length (); j++)
+		    {
+		      if (bitmap_bit_p (group->related_cands, j))
+			{
+			  struct iv_cand *cand = data->vcands[j];
+			  tree op = use->iv->ssa_name;
+			  if (op == cand->var_before || op == cand->var_after)
+			    {
+			      group->bind_cand = cand;
+			      if (dump_file && (dump_flags & TDF_DETAILS))
+				{
+				  fprintf (dump_file, "Doloop cmp iv use: ");
+				  print_gimple_stmt (dump_file, stmt,
+						     TDF_DETAILS);
+				  dump_cand (dump_file, cand);
+				}
+			      break;
+			    }
+			}
+		    }
+		  if (group->bind_cand)
+		    return true;
+		}
+	    }
+	}
+    }
+
+  return false;
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
@@ -7580,6 +7698,7 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   basic_block *body;
 
   gcc_assert (!data->niters);
+  data->doloop_use_p = false;
   data->current_loop = loop;
   data->loop_loc = find_loop_location (loop).get_location_t ();
   data->speed = optimize_loop_for_speed_p (loop);
@@ -7625,6 +7744,19 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   /* Finds candidates for the induction variables (item 2).  */
   find_iv_candidates (data);
 
+  if (flag_branch_on_count_reg && targetm.have_count_reg_decr_p
+      && generic_predict_doloop_p (data))
+    {
+      data->doloop_use_p = find_doloop_use_and_its_bind (data);
+      if (data->doloop_use_p && dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file,
+		   "Predict loop %d can perform doloop optimization later.\n",
+		   loop->num);
+	  flow_loop_dump (loop, dump_file, NULL, 1);
+	}
+    }
+
   /* Calculates the costs (item 3, part 1).  */
   determine_iv_costs (data);
   determine_group_iv_costs (data);

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PING^1][PATCH v4 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-06-20 12:17       ` Kewen.Lin
@ 2019-07-10  2:31         ` Kewen.Lin
  2019-07-12 12:40           ` Richard Biener
  0 siblings, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-07-10  2:31 UTC (permalink / raw)
  To: gcc-patches
  Cc: Segher Boessenkool, wschmidt, bin.cheng, rguenther, jakub,
	Jeff Law, Kugan Vivekanandarajah

Hi all,

I'd like to gentle ping the below patch:
https://gcc.gnu.org/ml/gcc-patches/2019-06/msg01225.html

The previous version for more context/background:
https://gcc.gnu.org/ml/gcc-patches/2019-06/msg01126.html

Thanks a lot in advance!


on 2019/6/20 下午8:16, Kewen.Lin wrote:
> Hi,
> 
> Sorry, the previous patch is incomplete.
> New one attached.  Sorry for inconvenience.
> 
> on 2019/6/20 下午8:08, Kewen.Lin wrote:
>> Hi Segher,
>>
>>> On Wed, Jun 19, 2019 at 07:47:34PM +0800, Kewen.Lin wrote:
>>>> +/* Return true if count register for branch is supported.  */
>>>> +
>>>> +static bool
>>>> +rs6000_have_count_reg_decr_p ()
>>>> +{
>>>> +  return flag_branch_on_count_reg;
>>>> +}
>>>
>>> rs6000 unconditionally supports these instructions, not just when that
>>> flag is set.  If you need to look at the flag, the *caller* of this new
>>> hook should, not every implementation of the hook.  So just "return true"
>>> here?
>>
>> Good point!  Updated it as hookpod.
>>
>>>> +/* For doloop use, if the algothrim selects some candidate which invalid for
>>>
>>> "algorithm", "which is invalid".
>>
>>>> +   some cost like zero rather than original inifite cost.  The point is to
>>>
>>> "infinite"
>>>
>>
>> Thanks for catching!  I should run spelling check next time.  :)
>>
>> New version attached with comments addressed.
>>
>>
>> Thanks,
>> Kewen
>>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PING^1][PATCH v4 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-10  2:31         ` [PING^1][PATCH v4 " Kewen.Lin
@ 2019-07-12 12:40           ` Richard Biener
  2019-07-12 14:10             ` Segher Boessenkool
                               ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: Richard Biener @ 2019-07-12 12:40 UTC (permalink / raw)
  To: Kewen.Lin
  Cc: gcc-patches, Segher Boessenkool, wschmidt, bin.cheng, jakub,
	Jeff Law, Kugan Vivekanandarajah

[-- Attachment #1: Type: text/plain, Size: 2442 bytes --]

On Wed, 10 Jul 2019, Kewen.Lin wrote:

> Hi all,
> 
> I'd like to gentle ping the below patch:
> https://gcc.gnu.org/ml/gcc-patches/2019-06/msg01225.html
> 
> The previous version for more context/background:
> https://gcc.gnu.org/ml/gcc-patches/2019-06/msg01126.html
> 
> Thanks a lot in advance!

Again I would have hoped Bin to chime in here.

Am I correct that doloop HW implementations are constrained
by a decrement of one?  I see no code in the patch to constrain
things this way.  I'm not familiar with the group code at all
but I would have expected the patch to only affect costing
of IVs of the appropriate form (decrement one and possibly
no uses besides the one in the compare/decrement).  Since
ivcanon already adds a canonical counter IV it's not
necessary to generate an artificial candidate IV of the
wanted style (that's something I might have expected as well).

The rest should be just magic from the IVOPTs side?

There might be the need to only consider at most one counter IV
in the costing code.

Richard.

> 
> on 2019/6/20 下午8:16, Kewen.Lin wrote:
> > Hi,
> > 
> > Sorry, the previous patch is incomplete.
> > New one attached.  Sorry for inconvenience.
> > 
> > on 2019/6/20 下午8:08, Kewen.Lin wrote:
> >> Hi Segher,
> >>
> >>> On Wed, Jun 19, 2019 at 07:47:34PM +0800, Kewen.Lin wrote:
> >>>> +/* Return true if count register for branch is supported.  */
> >>>> +
> >>>> +static bool
> >>>> +rs6000_have_count_reg_decr_p ()
> >>>> +{
> >>>> +  return flag_branch_on_count_reg;
> >>>> +}
> >>>
> >>> rs6000 unconditionally supports these instructions, not just when that
> >>> flag is set.  If you need to look at the flag, the *caller* of this new
> >>> hook should, not every implementation of the hook.  So just "return true"
> >>> here?
> >>
> >> Good point!  Updated it as hookpod.
> >>
> >>>> +/* For doloop use, if the algothrim selects some candidate which invalid for
> >>>
> >>> "algorithm", "which is invalid".
> >>
> >>>> +   some cost like zero rather than original inifite cost.  The point is to
> >>>
> >>> "infinite"
> >>>
> >>
> >> Thanks for catching!  I should run spelling check next time.  :)
> >>
> >> New version attached with comments addressed.
> >>
> >>
> >> Thanks,
> >> Kewen
> >>
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Linux GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany;
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah; HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PING^1][PATCH v4 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-12 12:40           ` Richard Biener
@ 2019-07-12 14:10             ` Segher Boessenkool
  2019-07-15  6:40             ` Kewen.Lin
  2019-07-15  6:50             ` Bin.Cheng
  2 siblings, 0 replies; 43+ messages in thread
From: Segher Boessenkool @ 2019-07-12 14:10 UTC (permalink / raw)
  To: Richard Biener
  Cc: Kewen.Lin, gcc-patches, wschmidt, bin.cheng, jakub, Jeff Law,
	Kugan Vivekanandarajah

On Fri, Jul 12, 2019 at 02:11:16PM +0200, Richard Biener wrote:
> Am I correct that doloop HW implementations are constrained
> by a decrement of one?

GCC's doloop patterns are.  Not all hardware is.


Segher

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PING^1][PATCH v4 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-12 12:40           ` Richard Biener
  2019-07-12 14:10             ` Segher Boessenkool
@ 2019-07-15  6:40             ` Kewen.Lin
  2019-07-15  6:50             ` Bin.Cheng
  2 siblings, 0 replies; 43+ messages in thread
From: Kewen.Lin @ 2019-07-15  6:40 UTC (permalink / raw)
  To: Richard Biener
  Cc: gcc-patches, Segher Boessenkool, wschmidt, bin.cheng, jakub,
	Jeff Law, Kugan Vivekanandarajah

Hi Richard,

on 2019/7/12 下午8:11, Richard Biener wrote:
> On Wed, 10 Jul 2019, Kewen.Lin wrote:
> 
>> Hi all,
>>
>> I'd like to gentle ping the below patch:
>> https://gcc.gnu.org/ml/gcc-patches/2019-06/msg01225.html
>>
>> The previous version for more context/background:
>> https://gcc.gnu.org/ml/gcc-patches/2019-06/msg01126.html
>>
>> Thanks a lot in advance!
> 
> Again I would have hoped Bin to chime in here.
> 
> Am I correct that doloop HW implementations are constrained
> by a decrement of one?  I see no code in the patch to constrain
> things this way.  

If my understanding is correct, under have_count_reg_decr_p I don't
think we should check for the decrement-by-one pattern: doloop can
transform the loop closing into a decrement by 1, since it knows the
total iteration count.  Since it uses a special hardware register,
like the Power count register, we don't expect it to be shared with
other uses.  Btw, it also doesn't require the compare to be in the
comp/decrement form, so this patch focuses more on whether this comp
is needed or not (i.e. whether it should be considered in selection).

> I'm not familiar with the group code at all
> but I would have expected the patch to only affect costing
> of IVs of the appropriate form (decrement one and possibly
> no uses besides the one in the compare/decrement).  

But since we select an IV cand for every IV use, we never know whether
this IV cand will have only that one use until the whole selection is
done.

> Since
> ivcanon already adds a canonical counter IV it's not
> necessary to generate an artificial candidate IV of the
> wanted style (that's something I might have expected as well).

This patch is only for the case guarded by have_count_reg_decr_p.
It requires neither the artificial candidate IV nor the
decrement-compare-jump code sequence.  The code on Power looks
like:  mtctr Rx   // move Rx (which holds total_counter)
                  // to ctr register
      L:
       loop body...
       bdnz L     // decrement ctr register and jump to L if
                  // ctr nonzero

> 
> The rest should be just magic from the IVOPTs side?
> 
> There might be the need to only consider at most one counter IV
> in the costing code.

The current patch doesn't introduce any IV cands but focuses on
zeroing the cost of the comp IV use, since we know it will be eliminated.
It still leverages the existing candidate selection algorithm to decide
the final optimal IV set.  It brings back the canonical counter IV only
if it's not selected by any IV use, to keep the doloop comp use
rewriting correct; that shouldn't affect anything, since the use will
be eliminated and is the only use, so the IV and its related code will
be removed as well.


Thanks,
Kewen

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PING^1][PATCH v4 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-12 12:40           ` Richard Biener
  2019-07-12 14:10             ` Segher Boessenkool
  2019-07-15  6:40             ` Kewen.Lin
@ 2019-07-15  6:50             ` Bin.Cheng
  2 siblings, 0 replies; 43+ messages in thread
From: Bin.Cheng @ 2019-07-15  6:50 UTC (permalink / raw)
  To: Richard Biener
  Cc: Kewen.Lin, gcc-patches List, Segher Boessenkool, Bill Schmidt,
	bin.cheng, Jakub Jelinek, Jeff Law, Kugan Vivekanandarajah

On Fri, Jul 12, 2019 at 8:11 PM Richard Biener <rguenther@suse.de> wrote:
>
> On Wed, 10 Jul 2019, Kewen.Lin wrote:
>
> > Hi all,
> >
> > I'd like to gentle ping the below patch:
> > https://gcc.gnu.org/ml/gcc-patches/2019-06/msg01225.html
> >
> > The previous version for more context/background:
> > https://gcc.gnu.org/ml/gcc-patches/2019-06/msg01126.html
> >
> > Thanks a lot in advance!
>
> Again I would have hoped Bin to chime in here.
Sorry for missing this one, will get to the patch this week.  Sorry
again for the inconvenience.

Thanks,
bin
>
> Am I correct that doloop HW implementations are constrained
> by a decrement of one?  I see no code in the patch to constrain
> things this way.  I'm not familiar with the group code at all
> but I would have expected the patch to only affect costing
> of IVs of the appropriate form (decrement one and possibly
> no uses besides the one in the compare/decrement).  Since
> ivcanon already adds a canonical counter IV it's not
> necessary to generate an artificial candidate IV of the
> wanted style (that's something I might have expected as well).
>
> The rest should be just magic from the IVOPTs side?
>
> There might be the need to only consider at most one counter IV
> in the costing code.
>
> Richard.
>
> >
> > on 2019/6/20 下午8:16, Kewen.Lin wrote:
> > > Hi,
> > >
> > > Sorry, the previous patch is incomplete.
> > > New one attached.  Sorry for inconvenience.
> > >
> > > on 2019/6/20 下午8:08, Kewen.Lin wrote:
> > >> Hi Segher,
> > >>
> > >>> On Wed, Jun 19, 2019 at 07:47:34PM +0800, Kewen.Lin wrote:
> > >>>> +/* Return true if count register for branch is supported.  */
> > >>>> +
> > >>>> +static bool
> > >>>> +rs6000_have_count_reg_decr_p ()
> > >>>> +{
> > >>>> +  return flag_branch_on_count_reg;
> > >>>> +}
> > >>>
> > >>> rs6000 unconditionally supports these instructions, not just when that
> > >>> flag is set.  If you need to look at the flag, the *caller* of this new
> > >>> hook should, not every implementation of the hook.  So just "return true"
> > >>> here?
> > >>
> > >> Good point!  Updated it as hookpod.
> > >>
> > >>>> +/* For doloop use, if the algothrim selects some candidate which invalid for
> > >>>
> > >>> "algorithm", "which is invalid".
> > >>
> > >>>> +   some cost like zero rather than original inifite cost.  The point is to
> > >>>
> > >>> "infinite"
> > >>>
> > >>
> > >> Thanks for catching!  I should run spelling check next time.  :)
> > >>
> > >> New version attached with comments addressed.
> > >>
> > >>
> > >> Thanks,
> > >> Kewen
> > >>
> >
> >
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE Linux GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany;
> GF: Felix Imendörffer, Mary Higgins, Sri Rasiah; HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-06-19 11:47 ` [PATCH v3 3/3] PR80791 " Kewen.Lin
  2019-06-20  9:09   ` Segher Boessenkool
@ 2019-07-21  9:06   ` Bin.Cheng
  2019-07-22  5:42     ` Kewen.Lin
  1 sibling, 1 reply; 43+ messages in thread
From: Bin.Cheng @ 2019-07-21  9:06 UTC (permalink / raw)
  To: Kewen.Lin; +Cc: gcc-patches List

On Wed, Jun 19, 2019 at 7:47 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> Hi all,
>
> This is the following patch after https://gcc.gnu.org/ml/gcc-patches/2019-06/msg00910.html
>
> Main steps:
>   1) Identify the doloop cmp type iv use and record its bind_cand (explain it later).
>   2) Set zero cost for pairs between this use and any iv cand.
>   3) IV cand set selecting algorithm runs as usual.
>   4) Fix up the selected iv cand for doloop use if need.
>
> It only focuses on the targets like Power which has specific count register.
> target hook have_count_reg_decr_p is proposed for it.
>
> Some notes:
>
> *) Why we need zero cost?  How about just decrease the cost for the pair
>    between doloop use and its original iv cand?  How about just decrease
>    the cost for the pair between doloop use and one selected iv cand?
>
>    Since some target supports hardware count register for decrement and
>    branch, it doesn't need the general instruction sequence for decr, cmp and
>    branch in general registers.  The cost of moving count register to GPR
>    is generally high, so it's standalone and can't be shared with other iv
>    uses.  It means IVOPTs can take doloop use as invisible (zero cost).
>
>    Let's take a look at PR80791 for example.
>
>                             original biv (cand 4)  use derived iv (cand 6)
>      generic use:                   4                  0
>      comp use (doloop use):         0                 infinite
>
>     For iv cost, original biv has cost 4 while use derived iv has cost 5.
>     When IVOPTs considers doloop use, the optimal cost is 8 (original biv
>     iv cost 4 + use cost 4).  Unfortunately it's not actually optimal, since
>     later doloop transformation updates loop closing by count register,
>     original biv (and its update) won't be needed in loop closing any more.
>     The generic use become the only use for original biv.  That means, if we
>     know the doloop will perform later, we shouldn't consider the doloop use
>     when determining IV set.  If we don't consider it, the algorithm will
>     choose iv cand 6 with total cost 5 (iv cost 5 + use cost 0).
>
>     From the above, we can see that to decrease the cost for the pair between
>     doloop use and original biv doesn't work.  Meanwhile it's hard to predict
>     one good iv cand in final optimal set here and pre-update the cost
>     between it and doloop use.  The analysis would be heavy and imperfect.
>
> *) Why we need bind_cand?
>
>     As above, we assign zero cost for pairs between doloop use and each iv
>     cand.  It's possible that doloop use gets assigned one iv cand which is
>     invalid to be used during later rewrite.  Then we have to fix it up with iv
>     cand originally used for it.  It's fine whatever this iv cand exists in
>     final iv cand set or not, even if it's not in the set, it will be
>     eliminated in doloop transformation.
>
> By the way, I was thinking whether we can replace the hook have_count_reg_decr_p
> with flag_branch_on_count_reg.  As the description of the "no-" option, "Disable
> the optimization pass that scans for opportunities to use 'decrement and branch'
> instructions on a count register instead of instruction sequences that decrement
> a register, compare it against zero, and then branch based upon the result.", it
> implicitly says it has count register support.  But I noticed that the gate of
> doloop_optimize checks this flag, as what I got from the previous discussions, some
> targets which can perform doloop_optimize don't have specific count register, so it
> sounds we can't make use of the flag, is it correct?
>
> Bootstrapped on powerpcle, also ran regression testing on powerpcle, got one failure
> which is exposed by this patch and the root cause is duplicate of PR62147.
> case is gcc.target/powerpc/20050830-1.c
>
> Is it OK for trunk?
Sorry for the delay.

I am not in favor of the approach very much.  When rewriting the pass
last time, we tried to reuse as much code as possible between cost
computation and iv_use rewriting.  We also followed the guideline that
when a finite cost is computed for a cand/use pair, the use should be
successfully rewritable using the cand.  However, the patch adjusts
infinite cost to zero cost, so a selected cand may not be usable to
rewrite the iv_use; this is a backward step IMHO.

I am not sure if this is only useful for doloop cases, or for general cases?

Comment mentioned the point is to give more chances to consider other
IV cands instead of BIV.  If current algorithm relies on zeroing cost
of impossible cand/use pair to select optimal result, I suspect it's a
bug which should be fixed in candidate selection algorithm.  Do you
have a test case showing the issue? We should fix it as a standalone
problem, while the approach is covering the problem and not that
sound.

However, I think the patch can be changed that only finite cost should
be adjusted to zero.  Thus guarantee any cand selected is valid to
rewrite iv_use.

Thanks,
bin
>
> --------------
>
> gcc/ChangeLog
>
> 2019-06-19  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * target.def (have_count_reg_decr_p): New hook.
>         * doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): New hook.
>         * doc/tm.texi: Regenerate.
>         * config/rs6000/rs6000.c (rs6000_have_count_reg_decr_p): New function.
>         (TARGET_HAVE_COUNT_REG_DECR_P): New macro.
>         * tree-ssa-loop-ivopts.c (adjust_group_iv_cost_for_doloop): New function.
>         (fixup_doloop_groups): Likewise.
>         (find_doloop_use_and_its_bind): Likewise.
>         (record_group): Init bind_cand.
>         (determine_group_iv_cost): Call adjust_group_iv_cost_for_doloop.
>         (find_optimal_iv_set): Call fixup_doloop_groups.
>         (tree_ssa_iv_optimize_loop): Call function have_count_reg_decr_p,
>         generic_predict_doloop_p and find_doloop_use_and_its_bind.
>         (generic_predict_doloop_p): Update attribute.
>
> gcc/testsuite/ChangeLog
>
> 2019-06-19  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * gcc.dg/tree-ssa/ivopts-lt.c: Adjust.
>
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-21  9:06   ` [PATCH v3 " Bin.Cheng
@ 2019-07-22  5:42     ` Kewen.Lin
  2019-07-22  6:53       ` Segher Boessenkool
  2019-07-23  6:28       ` [PATCH v5 " Kewen.Lin
  0 siblings, 2 replies; 43+ messages in thread
From: Kewen.Lin @ 2019-07-22  5:42 UTC (permalink / raw)
  To: Bin.Cheng; +Cc: gcc-patches List, segher, Bill Schmidt, Richard Guenther

Hi Bin,

on 2019/7/21 上午11:07, Bin.Cheng wrote:
> On Wed, Jun 19, 2019 at 7:47 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> Hi all,
>>
>> This is the following patch after https://gcc.gnu.org/ml/gcc-patches/2019-06/msg00910.html
>>
>> Main steps:
>>   1) Identify the doloop cmp type iv use and record its bind_cand (explain it later).
>>   2) Set zero cost for pairs between this use and any iv cand.
>>   3) IV cand set selecting algorithm runs as usual.
>>   4) Fix up the selected iv cand for doloop use if need.
>>
>> It only focuses on the targets like Power which has specific count register.
>> target hook have_count_reg_decr_p is proposed for it.
>>
>> Some notes:
>>
>> *) Why we need zero cost?  How about just decrease the cost for the pair
>>    between doloop use and its original iv cand?  How about just decrease
>>    the cost for the pair between doloop use and one selected iv cand?
>>
>>    Since some target supports hardware count register for decrement and
>>    branch, it doesn't need the general instruction sequence for decr, cmp and
>>    branch in general registers.  The cost of moving count register to GPR
>>    is generally high, so it's standalone and can't be shared with other iv
>>    uses.  It means IVOPTs can take doloop use as invisible (zero cost).
>>
>>    Let's take a look at PR80791 for example.
>>
>>                             original biv (cand 4)  use derived iv (cand 6)
>>      generic use:                   4                  0
>>      comp use (doloop use):         0                 infinite
>>
>>     For iv cost, original biv has cost 4 while use derived iv has cost 5.
>>     When IVOPTs considers doloop use, the optimal cost is 8 (original biv
>>     iv cost 4 + use cost 4).  Unfortunately it's not actually optimal, since
>>     later doloop transformation updates loop closing by count register,
>>     original biv (and its update) won't be needed in loop closing any more.
>>     The generic use become the only use for original biv.  That means, if we
>>     know the doloop will perform later, we shouldn't consider the doloop use
>>     when determining IV set.  If we don't consider it, the algorithm will
>>     choose iv cand 6 with total cost 5 (iv cost 5 + use cost 0).
>>
>>     From the above, we can see that to decrease the cost for the pair between
>>     doloop use and original biv doesn't work.  Meanwhile it's hard to predict
>>     one good iv cand in final optimal set here and pre-update the cost
>>     between it and doloop use.  The analysis would be heavy and imperfect.
>>
>> *) Why we need bind_cand?
>>
>>     As above, we assign zero cost for pairs between doloop use and each iv
>>     cand.  It's possible that doloop use gets assigned one iv cand which is
>>     invalid to be used during later rewrite.  Then we have to fix it up with iv
>>     cand originally used for it.  It's fine whatever this iv cand exists in
>>     final iv cand set or not, even if it's not in the set, it will be
>>     eliminated in doloop transformation.
>>
>> By the way, I was thinking whether we can replace the hook have_count_reg_decr_p
>> with flag_branch_on_count_reg.  As the description of the "no-" option, "Disable
>> the optimization pass that scans for opportunities to use 'decrement and branch'
>> instructions on a count register instead of instruction sequences that decrement
>> a register, compare it against zero, and then branch based upon the result.", it
>> implicitly says it has count register support.  But I noticed that the gate of
>> doloop_optimize checks this flag, as what I got from the previous discussions, some
>> targets which can perform doloop_optimize don't have specific count register, so it
>> sounds we can't make use of the flag, is it correct?
>>
>> Bootstrapped on powerpcle, also ran regression testing on powerpcle, got one failure
>> which is exposed by this patch and the root cause is duplicate of PR62147.
>> case is gcc.target/powerpc/20050830-1.c
>>
>> Is it OK for trunk?
> Sorry for the delaying.
> 
> I am not in favor of the approach very much.  When rewriting the pass
> last time, we tried to reuse as much code as possible between cost
> computation and iv_use rewriting.  we also followed guideline when
> finite cost computed for cand/use pair, the use should be rewritten
> using the cand successfully.  However, the patch adjust infinite cost
> to zero cost causing cand can't be used to rewrite iv_use selected,
> this is a backward step IMHO.

Thanks a lot for your time and comments.

V2: https://gcc.gnu.org/ml/gcc-patches/2019-05/msg00655.html

The previous version 2 (above link) taught the selection
algorithm to be aware of the group with bind_cand; it didn't zero
the cost of the doloop IV use, but both approaches are equivalent to
ignoring this doloop IV use in selection.

Then I was thinking as granted that it changed many places to take care 
of this bind_cand group, worsen the readability and seems invasive to
the existing algorithm too much.  For example, affected functions were: 
set_group_iv_cost/iv_ca_dump/try_add_cand_for/iv_ca_extend/iv_ca_narrow
/iv_ca_replace.  

At that time, I thought version 3 doesn't need to teach the existing algorithm
anything, leaves it to go as before and only need some fixups when it needs.
It has better readability and well fit in current handlings.
> 
> I am not sure if this is only useful for doloop cases, or for general cases?
> 

Not sure either.

> Comment mentioned the point is to give more chances to consider other
> IV cands instead of BIV.  If current algorithm relies on zeroing cost
> of impossible cand/use pair to select optimal result, I suspect it's a
> bug which should be fixed in candidate selection algorithm.  Do you
> have a test case showing the issue? We should fix it as a standalone
> problem, while the approach is covering the problem and not that
> sound.

The best case is the one which caused PR80791.  It has two IV uses: one
is generic (use 0) and the other is a compare (doloop use, use 1).
The optimal set is to assign cand 6 to use 0 and to assign any cand
other than an infinite-cost one to use 1.  The set actually selected
assigns the BIV to both uses.

The wording "to give more chances to consider other IV cands instead of
BIV" is for the use 0.  If we don't make anything special for use 1, cand
4 is always considered in the optimal set, then use 0 can't have any chances
to use cand 6 (extra iv cost).  Zeroing the cost to make selection not
consider use 1 at all.

You may have the question why not just zeroing those finite cost ones,
then in this case, only cand 1->5 can be selected for use 1 then BIV still
wins since it have best iv cost, it's selected for use 0 instead of cand 6.
If we consider infinite cost cand, bring cand 6 in and get optimal set cand 6.
btw, under TARGET_HAVE_COUNT_REG_DECR_P, the doloop use will be eliminated.
Zeroing it also matches this.

The original cost mapping without my patch:

<Group-candidate Costs>:
Group 0:
  cand>-cost>---compl.>-inv.expr.>------inv.vars
  0>----8>------0>------1;>-----NIL;
  1>----8>------0>------1;>-----NIL;
  2>----4>------0>------NIL;>---NIL;
  3>----8>------0>------1;>-----NIL;
  4>----4>------0>------NIL;>---NIL;
  5>----8>------0>------NIL;>---NIL;
  6>----0>------0>------NIL;>---NIL;

Group 1:
  cand>-cost>---compl.>-inv.expr.>------inv.vars
  0>----8>------0>------NIL;>---1
  1>----8>------0>------NIL;>---1
  2>----0>------0>------NIL;>---NIL;
  3>----4>------0>------NIL;>---1
  4>----0>------0>------NIL;>---NIL;
  5>----4>------0>------NIL;>---NIL;

I'm not sure it can be taken as a bug or not, better to say doloop use
only?  For doloop use, we can teach the algorithm not to take iv cost
into account if that iv cand is only for doloop use.  Since the doloop
use will be eliminated, it's fine.  But for the generic one, we need to
have the iv cost there.

I expect it doesn't need to teach many places. I'll give it a try.  :)

> 
> However, I think the patch can be changed that only finite cost should
> be adjusted to zero.  Thus guarantee any cand selected is valid to
> rewrite iv_use.
> 

Agreed if the above is taught.

Thanks again!


Kewen

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-22  5:42     ` Kewen.Lin
@ 2019-07-22  6:53       ` Segher Boessenkool
  2019-07-22  7:18         ` Kewen.Lin
  2019-07-22  8:02         ` Richard Biener
  2019-07-23  6:28       ` [PATCH v5 " Kewen.Lin
  1 sibling, 2 replies; 43+ messages in thread
From: Segher Boessenkool @ 2019-07-22  6:53 UTC (permalink / raw)
  To: Kewen.Lin; +Cc: Bin.Cheng, gcc-patches List, Bill Schmidt, Richard Guenther

Hi!

(Maybe I am missing half of the discussion -- sorry if so).

I think we should have a new iv for just the doloop (which can have the
same starting value and step and type as another iv).

Has this been considered?


Segher

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-22  6:53       ` Segher Boessenkool
@ 2019-07-22  7:18         ` Kewen.Lin
  2019-07-22  8:02         ` Richard Biener
  1 sibling, 0 replies; 43+ messages in thread
From: Kewen.Lin @ 2019-07-22  7:18 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Bin.Cheng, gcc-patches List, Bill Schmidt, Richard Guenther

Hi Segher,

on 2019/7/22 下午2:26, Segher Boessenkool wrote:
> Hi!
> 
> (Maybe I am missing half of the discussion -- sorry if so).
> 
> I think we should have a new iv for just the doloop (which can have the
> same starting value and step and type as another iv).
> 
> Has this been considered?
> 
> 

I don't have any patches to introduce it.  I guess you mean one pre-bind
candidate is dedicated to doloop use only?  Version 2 introduced pre-bind,
but I dropped it as it's invasive to the current selection algorithm.

The current implementation is to zero the cost for the doloop use with any
candidate and let the selection algorithm pick whatever it likes for it.  I
think it's fine since doloop_optimize can transform anything into the expected
form as long as it knows the iteration count.

Thanks,
Kewen

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-22  6:53       ` Segher Boessenkool
  2019-07-22  7:18         ` Kewen.Lin
@ 2019-07-22  8:02         ` Richard Biener
  2019-07-22 21:47           ` Segher Boessenkool
  2019-07-23  6:09           ` Kewen.Lin
  1 sibling, 2 replies; 43+ messages in thread
From: Richard Biener @ 2019-07-22  8:02 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Kewen.Lin, Bin.Cheng, gcc-patches List, Bill Schmidt

On Mon, 22 Jul 2019, Segher Boessenkool wrote:

> Hi!
> 
> (Maybe I am missing half of the discussion -- sorry if so).
> 
> I think we should have a new iv for just the doloop (which can have the
> same starting value and step and type as another iv).
> 
> Has this been considered?

I was also suggesting this (maybe with too many words ;)).  If
it's a doloop target add such IV (candidate!) which has zero
use-cost for the increment and compare but a (target configurable)
penalty for other uses.  Invasiveness of this approach is probably
that you need to distinguish this candidate by making it a new
kind (or maybe we can just have a special candidate number...).

Richard.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-22  8:02         ` Richard Biener
@ 2019-07-22 21:47           ` Segher Boessenkool
  2019-07-23  6:14             ` Kewen.Lin
  2019-07-23  7:38             ` Richard Biener
  2019-07-23  6:09           ` Kewen.Lin
  1 sibling, 2 replies; 43+ messages in thread
From: Segher Boessenkool @ 2019-07-22 21:47 UTC (permalink / raw)
  To: Richard Biener; +Cc: Kewen.Lin, Bin.Cheng, gcc-patches List, Bill Schmidt

On Mon, Jul 22, 2019 at 09:18:10AM +0200, Richard Biener wrote:
> On Mon, 22 Jul 2019, Segher Boessenkool wrote:
> 
> > Hi!
> > 
> > (Maybe I am missing half of the discussion -- sorry if so).
> > 
> > I think we should have a new iv for just the doloop (which can have the
> > same starting value and step and type as another iv).
> > 
> > Has this been considered?
> 
> I was also suggesting this (maybe with too many words ;)).  If
> it's a doloop target add such IV (candidate!) which has zero
> use-cost for the increment and compare but a (target configurable)
> penalty for other uses.  Invasiveness of this approach is probably
> that you need to distinguish this candidate by making it a new
> kind (or maybe we can just have a special candidate number...).

Or just set some (boolean) flag in the candidate.

I think it should simply not be allowed for any use except the doloop
uses at all?  You can have multiple ivs for the same loop just fine,
right?  And costs will make everything work out, if the costs are set
correctly?


Segher

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-22  8:02         ` Richard Biener
  2019-07-22 21:47           ` Segher Boessenkool
@ 2019-07-23  6:09           ` Kewen.Lin
  2019-07-23  8:05             ` Richard Biener
  1 sibling, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-07-23  6:09 UTC (permalink / raw)
  To: Richard Biener, Segher Boessenkool
  Cc: Bin.Cheng, gcc-patches List, Bill Schmidt

on 2019/7/22 下午3:18, Richard Biener wrote:
> On Mon, 22 Jul 2019, Segher Boessenkool wrote:
> 
>> Hi!
>>
>> (Maybe I am missing half of the discussion -- sorry if so).
>>
>> I think we should have a new iv for just the doloop (which can have the
>> same starting value and step and type as another iv).
>>
>> Has this been considered?
> 
> I was also suggesting this (maybe with too many words ;)).  If
> it's a doloop target add such IV (candidate!) which has zero
> use-cost for the increment and compare but a (target configurable)
> penalty for other uses.  Invasiveness of this approach is probably
> that you need to distinguish this candidate by making it a new
> kind (or maybe we can just have a special candidate number...).
> 

Hi Richard,

Really appreciate your comments on this, very sorry not to go with this.
Since this patch is for TARGET_HAVE_COUNT_REG_DECR_P, I was thinking
it's fairly enough to reuse the existing IV cands and just zeroing doloop
use cost with them.  I'm very happy to unify it.  If you/Segher/Bin don't
have any concerns, I'd like to make it as one follow up item.

One thing to double check is this dedicated IV will follow decrement
instead of increment align with doloop optimize?  Then it looks to shape
the loop closing to doloop pattern, at least it's decrement.


Thanks,
Kewen

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-22 21:47           ` Segher Boessenkool
@ 2019-07-23  6:14             ` Kewen.Lin
  2019-07-23  7:38             ` Richard Biener
  1 sibling, 0 replies; 43+ messages in thread
From: Kewen.Lin @ 2019-07-23  6:14 UTC (permalink / raw)
  To: Segher Boessenkool, Richard Biener
  Cc: Bin.Cheng, gcc-patches List, Bill Schmidt

Hi Segher,

on 2019/7/23 上午5:43, Segher Boessenkool wrote:
> On Mon, Jul 22, 2019 at 09:18:10AM +0200, Richard Biener wrote:
>> On Mon, 22 Jul 2019, Segher Boessenkool wrote:
>>
>>> Hi!
>>>
>>> (Maybe I am missing half of the discussion -- sorry if so).
>>>
>>> I think we should have a new iv for just the doloop (which can have the
>>> same starting value and step and type as another iv).
>>>
>>> Has this been considered?
>>
>> I was also suggesting this (maybe with too many words ;)).  If
>> it's a doloop target add such IV (candidate!) which has zero
>> use-cost for the increment and compare but a (target configurable)
>> penalty for other uses.  Invasiveness of this approach is probably
>> that you need to distinguish this candidate by making it a new
>> kind (or maybe we can just have a special candidate number...).
> 
> Or just set some (boolean) flag in the candidate.
> 
> I think it should simply not be allowed for any use except the doloop
> uses at all?  

For the targets where the iteration count doesn't sit in its hardware count
register, we may need to allow the IV to be used for other suitable uses?

> You can have multiple ivs for the same loop just fine,
> right?  

Yes.

> And costs will make everything work out, if the costs are set
> correctly?

There are some cases requiring to do IV elimination, it might require some
cost adjustment/tuning to keep this.  I met this when I did pre-bind the
BIV for it, if the dedicated IV has the best cost and is associated to
doloop use, it probably stops the others to merge.

If my understanding is correct, this is more like to transform the loop
into doloop pattern earlier, the penalty of mis-prediction of doloop can
be more? Pros is the setup code sequence for iteration count happens in
middle-end, can be optimized better (RTL misses some range info).

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v5 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-22  5:42     ` Kewen.Lin
  2019-07-22  6:53       ` Segher Boessenkool
@ 2019-07-23  6:28       ` Kewen.Lin
  2019-08-14  7:48         ` [PATCH v6 " Kewen.Lin
  1 sibling, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-07-23  6:28 UTC (permalink / raw)
  To: Bin.Cheng; +Cc: gcc-patches List, segher, Bill Schmidt, Richard Guenther

[-- Attachment #1: Type: text/plain, Size: 1317 bytes --]

Hi Bin,

This patch follows your suggestion, to avoid using an infinite-cost iv cand for
rewriting.  In order to allow other IV cands to be considered, it zeroes the iv
cand cost if its users are only doloop uses.  (See the typical case in previous reply.)

Could you please have a look?  Thanks in advance!


Kewen
-------------

gcc/ChangeLog

2019-07-23  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* target.def (have_count_reg_decr_p): New hook.
	* doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): New hook.
	* doc/tm.texi: Regenerate.
	* config/rs6000/rs6000.c (rs6000_have_count_reg_decr_p): New function.
	(TARGET_HAVE_COUNT_REG_DECR_P): New macro.
	* tree-ssa-loop-ivopts.c (adjust_group_iv_cost_for_doloop): New function.
	(find_doloop_use): Likewise.
	(record_group): Init doloop_p.
	(determine_group_iv_cost): Call adjust_group_iv_cost_for_doloop.
	(tree_ssa_iv_optimize_loop): Call function have_count_reg_decr_p, 
	generic_predict_doloop_p and find_doloop_use.
	(generic_predict_doloop_p): Update attribute.
	(iv_ca_set_no_cp): Adjust cand cost handling for doloop.
	(iv_ca_set_cp): Likewise.
	(iv_ca_new): Init n_cand_doloop_uses.
	(iv_ca_free): Free n_cand_doloop_uses.

gcc/testsuite/ChangeLog

2019-07-23  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* gcc.dg/tree-ssa/ivopts-lt.c: Adjust.




[-- Attachment #2: reassoc_v5.diff --]
[-- Type: text/plain, Size: 10224 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 6667cd0..e98aba9 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1912,6 +1912,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_PREDICT_DOLOOP_P
 #define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
 
+#undef TARGET_HAVE_COUNT_REG_DECR_P
+#define TARGET_HAVE_COUNT_REG_DECR_P true
+
 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index c2aa4d0..5477294 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11618,6 +11618,14 @@ loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
+@deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
+Return true if the target supports hardware count register for decrement
+and branch.  This count register can't be used as general register since
+moving to/from a general register from/to it is very expensive.
+For the targets with this support, ivopts can take doloop use as zero cost.
+The default value is false.
+@end deftypevr
+
 @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
 Return true if it is possible to use low-overhead loops (@code{doloop_end}
 and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index b4d57b8..5f43b27 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7946,6 +7946,8 @@ to by @var{ce_info}.
 
 @hook TARGET_PREDICT_DOLOOP_P
 
+@hook TARGET_HAVE_COUNT_REG_DECR_P
+
 @hook TARGET_CAN_USE_DOLOOP_P
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
diff --git a/gcc/target.def b/gcc/target.def
index 71b6972..8a64e5b 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4246,6 +4246,16 @@ The default version of this hook returns false.",
  bool, (struct loop *loop),
  default_predict_doloop_p)
 
+DEFHOOKPOD
+(have_count_reg_decr_p,
+ "Return true if the target supports hardware count register for decrement\n\
+and branch.  This count register can't be used as general register since\n\
+moving to/from a general register from/to it is very expensive.\n\
+For the targets with this support, ivopts can take doloop use as zero cost.\n\
+The default value is false.",
+ bool, false)
+
+
 DEFHOOK
 (can_use_doloop_p,
  "Return true if it is possible to use low-overhead loops (@code{doloop_end}\n\
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 7d5859b..3486e1a 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -18,5 +18,5 @@ f1 (char *p, uintptr_t i, uintptr_t n)
 }
 
 /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
-/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
-/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
+/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 530ea4a..80a0f12 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -399,6 +399,8 @@ struct iv_group
   struct cost_pair *cost_map;
   /* The selected candidate for the group.  */
   struct iv_cand *selected;
+  /* To indicate this is a doloop use group.  */
+  bool doloop_p;
   /* Uses in the group.  */
   vec<struct iv_use *> vuses;
 };
@@ -612,6 +614,9 @@ struct ivopts_data
 
   /* Whether the loop body can only be exited via single exit.  */
   bool loop_single_exit_p;
+
+  /* Whether the loop has doloop comparison use.  */
+  bool doloop_use_p;
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -630,6 +635,9 @@ struct iv_ca
   /* Number of times each candidate is used.  */
   unsigned *n_cand_uses;
 
+  /* How many doloop uses for each candidates.  */
+  unsigned *n_cand_doloop_uses;
+
   /* The candidates used.  */
   bitmap cands;
 
@@ -1528,6 +1536,7 @@ record_group (struct ivopts_data *data, enum use_type type)
   group->type = type;
   group->related_cands = BITMAP_ALLOC (NULL);
   group->vuses.create (1);
+  group->doloop_p = false;
 
   data->vgroups.safe_push (group);
   return group;
@@ -3724,7 +3733,7 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
    Some RTL specific checks seems unable to be checked in gimple, if any new
    checks or easy checks _are_ missing here, please add them.  */
 
-static bool ATTRIBUTE_UNUSED
+static bool
 generic_predict_doloop_p (struct ivopts_data *data)
 {
   struct loop *loop = data->current_loop;
@@ -5291,6 +5300,17 @@ determine_group_iv_cost_cond (struct ivopts_data *data,
   return !cost.infinite_cost_p ();
 }
 
+/* Set no cost for pair between doloop iv use GROUP and iv cand CAND.  */
+
+static void
+adjust_group_iv_cost_for_doloop (struct ivopts_data *data,
+				 struct iv_group *group, struct iv_cand *cand)
+{
+  struct cost_pair *cp = get_group_iv_cost (data, group, cand);
+  gcc_assert (cp);
+  cp->cost = no_cost;
+}
+
 /* Determines cost of computing uses in GROUP with CAND.  Returns false
    if USE cannot be represented with CAND.  */
 
@@ -5308,7 +5328,12 @@ determine_group_iv_cost (struct ivopts_data *data,
       return determine_group_iv_cost_address (data, group, cand);
 
     case USE_COMPARE:
-      return determine_group_iv_cost_cond (data, group, cand);
+      {
+	bool finite_cost_p = determine_group_iv_cost_cond (data, group, cand);
+	if (data->doloop_use_p && group->doloop_p && finite_cost_p)
+	  adjust_group_iv_cost_for_doloop (data, group, cand);
+	return finite_cost_p;
+      }
 
     default:
       gcc_unreachable ();
@@ -5829,11 +5854,15 @@ iv_ca_set_no_cp (struct ivopts_data *data, struct iv_ca *ivs,
   ivs->cand_for_group[gid] = NULL;
   ivs->n_cand_uses[cid]--;
 
+  if (group->doloop_p)
+    ivs->n_cand_doloop_uses[cid]--;
+  else if (ivs->n_cand_uses[cid] == ivs->n_cand_doloop_uses[cid])
+    ivs->cand_cost -= cp->cand->cost;
+
   if (ivs->n_cand_uses[cid] == 0)
     {
       bitmap_clear_bit (ivs->cands, cid);
       ivs->n_cands--;
-      ivs->cand_cost -= cp->cand->cost;
       iv_ca_set_remove_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
       iv_ca_set_remove_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
     }
@@ -5886,11 +5915,14 @@ iv_ca_set_cp (struct ivopts_data *data, struct iv_ca *ivs,
       ivs->bad_groups--;
       ivs->cand_for_group[gid] = cp;
       ivs->n_cand_uses[cid]++;
+      if (group->doloop_p)
+	ivs->n_cand_doloop_uses[cid]++;
+      else if (ivs->n_cand_uses[cid] == (ivs->n_cand_doloop_uses[cid] + 1))
+	ivs->cand_cost += cp->cand->cost;
       if (ivs->n_cand_uses[cid] == 1)
 	{
 	  bitmap_set_bit (ivs->cands, cid);
 	  ivs->n_cands++;
-	  ivs->cand_cost += cp->cand->cost;
 	  iv_ca_set_add_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
 	  iv_ca_set_add_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
 	}
@@ -6098,6 +6130,7 @@ iv_ca_new (struct ivopts_data *data)
   nw->cand_for_group = XCNEWVEC (struct cost_pair *,
 				 data->vgroups.length ());
   nw->n_cand_uses = XCNEWVEC (unsigned, data->vcands.length ());
+  nw->n_cand_doloop_uses = XCNEWVEC (unsigned, data->vcands.length ());
   nw->cands = BITMAP_ALLOC (NULL);
   nw->n_cands = 0;
   nw->n_invs = 0;
@@ -6117,6 +6150,7 @@ iv_ca_free (struct iv_ca **ivs)
 {
   free ((*ivs)->cand_for_group);
   free ((*ivs)->n_cand_uses);
+  free ((*ivs)->n_cand_doloop_uses);
   BITMAP_FREE ((*ivs)->cands);
   free ((*ivs)->n_inv_var_uses);
   free ((*ivs)->n_inv_expr_uses);
@@ -7568,6 +7602,47 @@ determine_scaling_factor (struct ivopts_data *data, basic_block *body)
     }
 }
 
+/* Find doloop comparison use and set its doloop_p on if found.  */
+
+static bool
+find_doloop_use (struct ivopts_data *data)
+{
+  struct loop *loop = data->current_loop;
+
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->type == USE_COMPARE)
+	{
+	  gcc_assert (group->vuses.length () == 1);
+	  struct iv_use *use = group->vuses[0];
+	  gimple *stmt = use->stmt;
+	  if (gimple_code (stmt) == GIMPLE_COND)
+	    {
+	      basic_block bb = gimple_bb (stmt);
+	      edge true_edge, false_edge;
+	      extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+	      /* This comparison is used for loop latch.  Require latch is empty
+		 for now.  */
+	      if ((loop->latch == true_edge->dest
+		   || loop->latch == false_edge->dest)
+		  && empty_block_p (loop->latch))
+		{
+		  group->doloop_p = true;
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file, "Doloop cmp iv use: ");
+		      print_gimple_stmt (dump_file, stmt, TDF_DETAILS);
+		    }
+		  return true;
+		}
+	    }
+	}
+    }
+
+  return false;
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
@@ -7580,6 +7655,7 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   basic_block *body;
 
   gcc_assert (!data->niters);
+  data->doloop_use_p = false;
   data->current_loop = loop;
   data->loop_loc = find_loop_location (loop).get_location_t ();
   data->speed = optimize_loop_for_speed_p (loop);
@@ -7625,6 +7701,19 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   /* Finds candidates for the induction variables (item 2).  */
   find_iv_candidates (data);
 
+  if (flag_branch_on_count_reg && targetm.have_count_reg_decr_p
+      && generic_predict_doloop_p (data))
+    {
+      data->doloop_use_p = find_doloop_use (data);
+      if (data->doloop_use_p && dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file,
+		   "Predict loop %d can perform doloop optimization later.\n",
+		   loop->num);
+	  flow_loop_dump (loop, dump_file, NULL, 1);
+	}
+    }
+
   /* Calculates the costs (item 3, part 1).  */
   determine_iv_costs (data);
   determine_group_iv_costs (data);

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-22 21:47           ` Segher Boessenkool
  2019-07-23  6:14             ` Kewen.Lin
@ 2019-07-23  7:38             ` Richard Biener
  1 sibling, 0 replies; 43+ messages in thread
From: Richard Biener @ 2019-07-23  7:38 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Kewen.Lin, Bin.Cheng, gcc-patches List, Bill Schmidt

On Mon, 22 Jul 2019, Segher Boessenkool wrote:

> On Mon, Jul 22, 2019 at 09:18:10AM +0200, Richard Biener wrote:
> > On Mon, 22 Jul 2019, Segher Boessenkool wrote:
> > 
> > > Hi!
> > > 
> > > (Maybe I am missing half of the discussion -- sorry if so).
> > > 
> > > I think we should have a new iv for just the doloop (which can have the
> > > same starting value and step and type as another iv).
> > > 
> > > Has this been considered?
> > 
> > I was also suggesting this (maybe with too many words ;)).  If
> > it's a doloop target add such IV (candidate!) which has zero
> > use-cost for the increment and compare but a (target configurable)
> > penalty for other uses.  Invasiveness of this approach is probably
> > that you need to distinguish this candidate by making it a new
> > kind (or maybe we can just have a special candidate number...).
> 
> Or just set some (boolean) flag in the candidate.
> 
> I think it should simply not be allowed for any use except the doloop
> uses at all?  You can have multiple ivs for the same loop just fine,
> right?  And costs will make everything work out, if the costs are set
> correctly?

Sure.  Upthread it was mentioned some targets can easily use the
counter IV in other IV uses so it's really a matter of costs.  That is,
IVOPTs generated "fake" RTL should, for doloop IVs, choose an
appropriate register so the target can do costing.

Richard.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-23  6:09           ` Kewen.Lin
@ 2019-07-23  8:05             ` Richard Biener
  0 siblings, 0 replies; 43+ messages in thread
From: Richard Biener @ 2019-07-23  8:05 UTC (permalink / raw)
  To: Kewen.Lin; +Cc: Segher Boessenkool, Bin.Cheng, gcc-patches List, Bill Schmidt

[-- Attachment #1: Type: text/plain, Size: 1833 bytes --]

On Tue, 23 Jul 2019, Kewen.Lin wrote:

> on 2019/7/22 下午3:18, Richard Biener wrote:
> > On Mon, 22 Jul 2019, Segher Boessenkool wrote:
> > 
> >> Hi!
> >>
> >> (Maybe I am missing half of the discussion -- sorry if so).
> >>
> >> I think we should have a new iv for just the doloop (which can have the
> >> same starting value and step and type as another iv).
> >>
> >> Has this been considered?
> > 
> > I was also suggesting this (maybe with too many words ;)).  If
> > it's a doloop target add such IV (candidate!) which has zero
> > use-cost for the increment and compare but a (target configurable)
> > penalty for other uses.  Invasiveness of this approach is probably
> > that you need to distinguish this candidate by making it a new
> > > kind (or maybe we can just have a special candidate number...).
> > 
> 
> Hi Richard,
> 
> I really appreciate your comments on this, and I'm very sorry for not going
> with this approach.  Since this patch is for TARGET_HAVE_COUNT_REG_DECR_P, I
> was thinking it was good enough to reuse the existing IV cands and just zero
> the doloop use cost with them.  I'm very happy to unify it.  If you/Segher/Bin
> don't have any concerns, I'd like to make it a follow-up item.
> 
> One thing to double check: will this dedicated IV decrement instead of
> increment, in line with the doloop optimization?  Then it would shape the
> loop closer to the doloop pattern; at least it's a decrement.

I think doloop support should be as "simple" as always adding a
candidate starting from niter (-1?), step -1 marked as DOLOOP_IV
(which is then used in costing, making uses in the IV update and
the compare zero cost and uses in other places according to the
target by using an appropriate hardreg for the fake RTL we create).

IV costing and elimination should then choose the doloop IV if that's
profitable.

Richard.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-07-23  6:28       ` [PATCH v5 " Kewen.Lin
@ 2019-08-14  7:48         ` Kewen.Lin
  2019-08-21 13:42           ` Bin.Cheng
  0 siblings, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-08-14  7:48 UTC (permalink / raw)
  To: gcc-patches List; +Cc: Bin.Cheng, segher, Bill Schmidt, Richard Guenther

[-- Attachment #1: Type: text/plain, Size: 3851 bytes --]

Hi!

Compared to the previous versions of the implementation, which were mainly
based on the existing IV cands but zeroed the related group/use cost, this new
one is based on Richard and Segher's suggestion of introducing one
doloop-dedicated IV cand.

Some key points are listed below:
  1) New field doloop_p in struct iv_cand to indicate doloop dedicated IV cand.
  2) Special name "doloop" assigned.
  3) Doloop IV cand with form (niter+1, +, -1)
  4) For doloop IV cand, no extra one cost like BIV, assign zero cost for step.
  5) Support may_be_zero (regressed PR is in this case), the base of doloop IV
     can be COND_EXPR, add handlings in cand_value_at and may_eliminate_iv.
  6) Add more expr support in force_expr_to_var_cost for reasonable cost
     calculation on the IV base with may_be_zero (like COND_EXPR).
  7) Set zero cost when using doloop IV cand for doloop use.
  8) Add three hooks (should we merge _generic and _address?).
    *) have_count_reg_decr_p, is to indicate the target has special hardware
       count register, we shouldn't consider the impact of doloop IV when
       calculating register pressures.
    *) doloop_cost_for_generic, is the extra cost when using doloop IV cand for
       generic type IV use.
    *) doloop_cost_for_address, is the extra cost when using doloop IV cand for
       address type IV use.

Bootstrapped on powerpc64le-linux-gnu; regression testing passed except for
one failure in gcc/testsuite/gcc.dg/guality/loop-1.c at -O3, which is tracked
by PR89983.

Any comments and suggestions are highly appreciated.  Thanks!

Kewen

---------

gcc/ChangeLog

2019-08-14  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* config/rs6000/rs6000.c (TARGET_HAVE_COUNT_REG_DECR_P): New macro.
	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
	* target.def (have_count_reg_decr_p): New hook.
	(doloop_cost_for_generic): Likewise.
	(doloop_cost_for_address): Likewise.
	* doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): Likewise.
	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
	* doc/tm.texi: Regenerate.
	* tree-ssa-loop-ivopts.c (comp_cost::operator+=): Consider infinite cost
	addend.
	(record_group): Init doloop_p.
	(add_candidate_1): Add optional argument doloop, change the handlings
	accordingly.
	(add_candidate): Likewise.
	(add_iv_candidate_for_biv): Update the call to add_candidate.
	(generic_predict_doloop_p): Update attribute.
	(force_expr_to_var_cost): Add costing for expressions COND_EXPR/LT_EXPR/
	LE_EXPR/GT_EXPR/GE_EXPR/EQ_EXPR/NE_EXPR/UNORDERED_EXPR/ORDERED_EXPR/
	UNLT_EXPR/UNLE_EXPR/UNGT_EXPR/UNGE_EXPR/UNEQ_EXPR/LTGT_EXPR/MAX_EXPR/
	MIN_EXPR.
	(determine_group_iv_cost_generic): Update for doloop IV cand.
	(determine_group_iv_cost_address): Likewise.
	(determine_group_iv_cost_cond): Likewise.
	(determine_iv_cost): Likewise.
	(ivopts_estimate_reg_pressure): Likewise.
	(cand_value_at): Update argument niter type to struct tree_niter_desc*,
	consider doloop IV cand and may_be_zero.
	(may_eliminate_iv): Update the call to cand_value_at, consider doloop
	IV cand and may_be_zero.
	(add_iv_candidate_for_doloop): New function.
	(find_iv_candidates): Call function add_iv_candidate_for_doloop.
	(determine_set_costs): Update the call to ivopts_estimate_reg_pressure.
	(iv_ca_recount_cost): Likewise.
	(iv_ca_new): Init n_doloop_cands.
	(iv_ca_set_no_cp): Update n_doloop_cands.
	(iv_ca_set_cp): Likewise.
	(iv_ca_dump): Dump register cost.
	(find_doloop_use): New function.
	(tree_ssa_iv_optimize_loop): Call function generic_predict_doloop_p and
	find_doloop_use.

gcc/testsuite/ChangeLog

2019-08-14  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* gcc.dg/tree-ssa/ivopts-3.c: Adjust for doloop change.
	* gcc.dg/tree-ssa/ivopts-lt.c: Likewise.
	* gcc.dg/tree-ssa/pr32044.c: Likewise.


[-- Attachment #2: doloop_dedicated_iv1.diff --]
[-- Type: text/plain, Size: 25270 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 6667cd0..5eccbdc 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1912,6 +1912,16 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_PREDICT_DOLOOP_P
 #define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
 
+#undef TARGET_HAVE_COUNT_REG_DECR_P
+#define TARGET_HAVE_COUNT_REG_DECR_P true
+
+/* 1000000000 is infinite cost in IVOPTs.  */
+#undef TARGET_DOLOOP_COST_FOR_GENERIC
+#define TARGET_DOLOOP_COST_FOR_GENERIC 1000000000
+
+#undef TARGET_DOLOOP_COST_FOR_ADDRESS
+#define TARGET_DOLOOP_COST_FOR_ADDRESS 1000000000
+
 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index c2aa4d0..9f3a08a 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11618,6 +11618,29 @@ loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
+@deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
+Return true if the target supports hardware count register for decrement
+and branch.  This count register can't be used as general register since
+moving to/from a general register from/to it is very expensive.
+The default value is false.
+@end deftypevr
+
+@deftypevr {Target Hook} int64_t TARGET_DOLOOP_COST_FOR_GENERIC
+IVOPTs introduces one doloop dedicated IV candidate, this hook offers
+ target owner a way to adjust cost when selecting doloop IV candidate for a
+ generic IV use.  At calcuation, this value will be added on normal cost
+ already calculated by current implementation.
+The default value is zero.
+@end deftypevr
+
+@deftypevr {Target Hook} int64_t TARGET_DOLOOP_COST_FOR_ADDRESS
+IVOPTs introduces one doloop dedicated IV candidate, this hook offers
+ target owner a way to adjust cost when selecting doloop IV candidate for an
+ address IV use.  At calcuation, this value will be added on normal cost
+ already calculated by current implementation.
+The default value is zero.
+@end deftypevr
+
 @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
 Return true if it is possible to use low-overhead loops (@code{doloop_end}
 and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index b4d57b8..4346773 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7946,6 +7946,12 @@ to by @var{ce_info}.
 
 @hook TARGET_PREDICT_DOLOOP_P
 
+@hook TARGET_HAVE_COUNT_REG_DECR_P
+
+@hook TARGET_DOLOOP_COST_FOR_GENERIC
+
+@hook TARGET_DOLOOP_COST_FOR_ADDRESS
+
 @hook TARGET_CAN_USE_DOLOOP_P
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
diff --git a/gcc/target.def b/gcc/target.def
index 71b6972..69e2844 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4246,6 +4246,32 @@ The default version of this hook returns false.",
  bool, (struct loop *loop),
  default_predict_doloop_p)
 
+DEFHOOKPOD
+(have_count_reg_decr_p,
+ "Return true if the target supports hardware count register for decrement\n\
+and branch.  This count register can't be used as general register since\n\
+moving to/from a general register from/to it is very expensive.\n\
+The default value is false.",
+ bool, false)
+
+DEFHOOKPOD
+(doloop_cost_for_generic,
+ "IVOPTs introduces one doloop dedicated IV candidate, this hook offers\n\
+ target owner a way to adjust cost when selecting doloop IV candidate for a\n\
+ generic IV use.  At calcuation, this value will be added on normal cost\n\
+ already calculated by current implementation.\n\
+The default value is zero.",
+ int64_t, 0)
+
+DEFHOOKPOD
+(doloop_cost_for_address,
+ "IVOPTs introduces one doloop dedicated IV candidate, this hook offers\n\
+ target owner a way to adjust cost when selecting doloop IV candidate for an\n\
+ address IV use.  At calcuation, this value will be added on normal cost\n\
+ already calculated by current implementation.\n\
+The default value is zero.",
+ int64_t, 0)
+
 DEFHOOK
 (can_use_doloop_p,
  "Return true if it is possible to use low-overhead loops (@code{doloop_end}\n\
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
index 214e6a7..ce4b1d0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
@@ -10,4 +10,6 @@ int main (void)
     f2 ();
 }
 
-/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" } }  */
+/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* More debug information emitted for doloop on powerpc.  */
+/* { dg-final { scan-tree-dump-times "!= 0" 6 "ivopts" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 7d5859b..71d7f67 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n)
   while (i < n);
 }
 
-/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
-/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
-/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
+/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c b/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
index 8a8977a..06c27b0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
@@ -1,6 +1,10 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -fdump-tree-optimized" } */
 
+/* For powerpc, disable doloop IV cand generation in IVOPTs to avoid unexpected
+   division operation for its base setup.  */
+/* { dg-additional-options "-fno-branch-count-reg" { target { powerpc*-*-* } } } */
+
 int foo (int n)
 {
   while (n >= 45)
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 530ea4a..11852af 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -275,6 +275,9 @@ comp_cost::operator+= (comp_cost cost)
 comp_cost
 comp_cost::operator+= (HOST_WIDE_INT c)
 {
+  if (c >= INFTY)
+    this->cost = INFTY;
+
   if (infinite_cost_p ())
     return *this;
 
@@ -399,6 +402,8 @@ struct iv_group
   struct cost_pair *cost_map;
   /* The selected candidate for the group.  */
   struct iv_cand *selected;
+  /* To indicate this is a doloop use group.  */
+  bool doloop_p;
   /* Uses in the group.  */
   vec<struct iv_use *> vuses;
 };
@@ -439,6 +444,7 @@ struct iv_cand
 			   be hoisted out of loop.  */
   struct iv *orig_iv;	/* The original iv if this cand is added from biv with
 			   smaller type.  */
+  bool doloop_p;	/* Whether this is a doloop candidate.  */
 };
 
 /* Hashtable entry for common candidate derived from iv uses.  */
@@ -612,6 +618,9 @@ struct ivopts_data
 
   /* Whether the loop body can only be exited via single exit.  */
   bool loop_single_exit_p;
+
+  /* Whether the loop has doloop comparison use.  */
+  bool doloop_use_p;
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -636,6 +645,9 @@ struct iv_ca
   /* The number of candidates in the set.  */
   unsigned n_cands;
 
+  /* The number of doloop candidate in the set.  */
+  unsigned n_doloop_cands;
+
   /* The number of invariants needed, including both invariant variants and
      invariant expressions.  */
   unsigned n_invs;
@@ -1528,6 +1540,7 @@ record_group (struct ivopts_data *data, enum use_type type)
   group->type = type;
   group->related_cands = BITMAP_ALLOC (NULL);
   group->vuses.create (1);
+  group->doloop_p = false;
 
   data->vgroups.safe_push (group);
   return group;
@@ -3017,9 +3030,9 @@ get_loop_invariant_expr (struct ivopts_data *data, tree inv_expr)
    replacement of the final value of the iv by a direct computation.  */
 
 static struct iv_cand *
-add_candidate_1 (struct ivopts_data *data,
-		 tree base, tree step, bool important, enum iv_position pos,
-		 struct iv_use *use, gimple *incremented_at,
+add_candidate_1 (struct ivopts_data *data, tree base, tree step, bool important,
+		 enum iv_position pos, struct iv_use *use,
+		 gimple *incremented_at, bool doloop = false,
 		 struct iv *orig_iv = NULL)
 {
   unsigned i;
@@ -3079,11 +3092,15 @@ add_candidate_1 (struct ivopts_data *data,
       cand->pos = pos;
       if (pos != IP_ORIGINAL)
 	{
-	  cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
+	  if (doloop)
+	    cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "doloop");
+	  else
+	    cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
 	  cand->var_after = cand->var_before;
 	}
       cand->important = important;
       cand->incremented_at = incremented_at;
+      cand->doloop_p = doloop;
       data->vcands.safe_push (cand);
 
       if (!poly_int_tree_p (step))
@@ -3116,6 +3133,7 @@ add_candidate_1 (struct ivopts_data *data,
     }
 
   cand->important |= important;
+  cand->doloop_p |= doloop;
 
   /* Relate candidate to the group for which it is added.  */
   if (use)
@@ -3209,16 +3227,17 @@ add_autoinc_candidates (struct ivopts_data *data, tree base, tree step,
    the end of loop.  */
 
 static void
-add_candidate (struct ivopts_data *data,
-	       tree base, tree step, bool important, struct iv_use *use,
+add_candidate (struct ivopts_data *data, tree base, tree step, bool important,
+	       struct iv_use *use, bool doloop = false,
 	       struct iv *orig_iv = NULL)
 {
   if (ip_normal_pos (data->current_loop))
-    add_candidate_1 (data, base, step, important,
-		     IP_NORMAL, use, NULL, orig_iv);
+    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
+		     orig_iv);
   if (ip_end_pos (data->current_loop)
       && allow_ip_end_pos_p (data->current_loop))
-    add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
+    add_candidate_1 (data, base, step, important, IP_END, use, NULL, doloop,
+		     orig_iv);
 }
 
 /* Adds standard iv candidates.  */
@@ -3262,7 +3281,7 @@ add_iv_candidate_for_biv (struct ivopts_data *data, struct iv *iv)
       tree step = fold_convert (sizetype, iv->step);
 
       /* Add iv cand of same precision as index part in TARGET_MEM_REF.  */
-      add_candidate (data, base, step, true, NULL, iv);
+      add_candidate (data, base, step, true, NULL, false, iv);
       /* Add iv cand of the original type only if it has nonlinear use.  */
       if (iv->nonlin_use)
 	add_candidate (data, iv->base, iv->step, true, NULL);
@@ -3724,7 +3743,7 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
    Some RTL specific checks seems unable to be checked in gimple, if any new
    checks or easy checks _are_ missing here, please add them.  */
 
-static bool ATTRIBUTE_UNUSED
+static bool
 generic_predict_doloop_p (struct ivopts_data *data)
 {
   struct loop *loop = data->current_loop;
@@ -4177,6 +4196,36 @@ force_expr_to_var_cost (tree expr, bool speed)
       STRIP_NOPS (op0);
       op1 = NULL_TREE;
       break;
+    /* See add_iv_candidate_for_doloop, for doloop may_be_zero case, we
+       introduce COND_EXPR for IV base, need to support better cost estimation
+       for this COND_EXPR and tcc_comparison.  */
+    case COND_EXPR:
+      op0 = TREE_OPERAND (expr, 1);
+      STRIP_NOPS (op0);
+      op1 = TREE_OPERAND (expr, 2);
+      STRIP_NOPS (op1);
+      break;
+    case LT_EXPR:
+    case LE_EXPR:
+    case GT_EXPR:
+    case GE_EXPR:
+    case EQ_EXPR:
+    case NE_EXPR:
+    case UNORDERED_EXPR:
+    case ORDERED_EXPR:
+    case UNLT_EXPR:
+    case UNLE_EXPR:
+    case UNGT_EXPR:
+    case UNGE_EXPR:
+    case UNEQ_EXPR:
+    case LTGT_EXPR:
+    case MAX_EXPR:
+    case MIN_EXPR:
+      op0 = TREE_OPERAND (expr, 0);
+      STRIP_NOPS (op0);
+      op1 = TREE_OPERAND (expr, 1);
+      STRIP_NOPS (op1);
+      break;
 
     default:
       /* Just an arbitrary value, FIXME.  */
@@ -4258,6 +4307,35 @@ force_expr_to_var_cost (tree expr, bool speed)
     case RSHIFT_EXPR:
       cost = comp_cost (add_cost (speed, mode), 0);
       break;
+    case COND_EXPR:
+      op0 = TREE_OPERAND (expr, 0);
+      STRIP_NOPS (op0);
+      if (op0 == NULL_TREE || TREE_CODE (op0) == SSA_NAME
+	  || CONSTANT_CLASS_P (op0))
+	cost = no_cost;
+      else
+	cost = force_expr_to_var_cost (op0, speed);
+      break;
+    case LT_EXPR:
+    case LE_EXPR:
+    case GT_EXPR:
+    case GE_EXPR:
+    case EQ_EXPR:
+    case NE_EXPR:
+    case UNORDERED_EXPR:
+    case ORDERED_EXPR:
+    case UNLT_EXPR:
+    case UNLE_EXPR:
+    case UNGT_EXPR:
+    case UNGE_EXPR:
+    case UNEQ_EXPR:
+    case LTGT_EXPR:
+    case MAX_EXPR:
+    case MIN_EXPR:
+      /* Simply use 1.5 * add cost for now, FIXME if there is some more accurate
+	 cost evaluation way.  */
+      cost = comp_cost (1.5 * add_cost (speed, mode), 0);
+      break;
 
     default:
       gcc_unreachable ();
@@ -4706,8 +4784,12 @@ determine_group_iv_cost_generic (struct ivopts_data *data,
   if (cand->pos == IP_ORIGINAL && cand->incremented_at == use->stmt)
     cost = no_cost;
   else
-    cost = get_computation_cost (data, use, cand, false,
-				 &inv_vars, NULL, &inv_expr);
+    {
+      cost = get_computation_cost (data, use, cand, false, &inv_vars, NULL,
+				   &inv_expr);
+      if (cand->doloop_p)
+	cost += targetm.doloop_cost_for_generic;
+    }
 
   if (inv_expr)
     {
@@ -4735,6 +4817,9 @@ determine_group_iv_cost_address (struct ivopts_data *data,
   cost = get_computation_cost (data, use, cand, true,
 			       &inv_vars, &can_autoinc, &inv_expr);
 
+  if (cand->doloop_p)
+    cost += targetm.doloop_cost_for_address;
+
   if (inv_expr)
     {
       inv_exprs = BITMAP_ALLOC (NULL);
@@ -4783,11 +4868,12 @@ determine_group_iv_cost_address (struct ivopts_data *data,
    stores it to VAL.  */
 
 static void
-cand_value_at (struct loop *loop, struct iv_cand *cand, gimple *at, tree niter,
-	       aff_tree *val)
+cand_value_at (struct loop *loop, struct iv_cand *cand, gimple *at,
+	       struct tree_niter_desc *desc, aff_tree *val)
 {
   aff_tree step, delta, nit;
   struct iv *iv = cand->iv;
+  tree niter = desc->niter;
   tree type = TREE_TYPE (iv->base);
   tree steptype;
   if (POINTER_TYPE_P (type))
@@ -4803,7 +4889,15 @@ cand_value_at (struct loop *loop, struct iv_cand *cand, gimple *at, tree niter,
   if (stmt_after_increment (loop, cand, at))
     aff_combination_add (&delta, &step);
 
-  tree_to_aff_combination (iv->base, type, val);
+  tree base = iv->base;
+  /* See add_iv_candidate_for_doloop, if may_be_zero is set, we want to extract
+     the value under !may_be_zero to get the compact bound which also well fits
+     for may_be_zero since we ensure the value for it is const one.  */
+  if (cand->doloop_p && desc->may_be_zero && !integer_zerop (desc->may_be_zero))
+    base = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
+			unshare_expr (rewrite_to_non_trapping_overflow (niter)),
+			build_int_cst (TREE_TYPE (niter), 1));
+  tree_to_aff_combination (base, type, val);
   if (!POINTER_TYPE_P (type))
     aff_combination_convert (val, steptype);
   aff_combination_add (val, &delta);
@@ -5142,7 +5236,7 @@ may_eliminate_iv (struct ivopts_data *data,
 	}
     }
 
-  cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
+  cand_value_at (loop, cand, use->stmt, desc, &bnd);
 
   *bound = fold_convert (TREE_TYPE (cand->iv->base),
 			 aff_combination_to_tree (&bnd));
@@ -5159,8 +5253,11 @@ may_eliminate_iv (struct ivopts_data *data,
 
      TODO: we could also calculate the value MAY_BE_ZERO ? 0 : NITER and
 	   base the exit condition on it.  However, that is often too
-	   expensive.  */
-  if (!integer_zerop (desc->may_be_zero))
+	   expensive.
+
+     For doloop candidate, we have considered MAY_BE_ZERO for IV base, need to
+     support MAY_BE_ZERO ? 0 : NITER, so simply bypass this check.  */
+  if (!integer_zerop (desc->may_be_zero) && !cand->doloop_p)
     return iv_elimination_compare_lt (data, cand, comp, desc);
 
   return true;
@@ -5264,6 +5361,9 @@ determine_group_iv_cost_cond (struct ivopts_data *data,
       inv_vars = inv_vars_elim;
       inv_vars_elim = NULL;
       inv_expr = inv_expr_elim;
+      /* For doloop candidate/use pair, adjust to zero cost.  */
+      if (group->doloop_p && cand->doloop_p)
+	cost = no_cost;
     }
   else
     {
@@ -5390,6 +5490,42 @@ relate_compare_use_with_all_cands (struct ivopts_data *data)
     }
 }
 
+/* Add one doloop dedicated IV candidate:
+     - Base is (may_be_zero ? 1 : (niter + 1)).
+     - Step is -1.  */
+
+static void
+add_iv_candidate_for_doloop (struct ivopts_data *data)
+{
+  tree_niter_desc *niter_desc = niter_for_single_dom_exit (data);
+  gcc_assert (niter_desc && niter_desc->assumptions);
+
+  tree niter = niter_desc->niter;
+  tree ntype = TREE_TYPE (niter);
+  gcc_assert (TREE_CODE (ntype) == INTEGER_TYPE);
+
+  tree may_be_zero = niter_desc->may_be_zero;
+  if (may_be_zero && integer_zerop (may_be_zero))
+    may_be_zero = NULL_TREE;
+  if (may_be_zero)
+    {
+      if (COMPARISON_CLASS_P (may_be_zero))
+	{
+	  niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
+			       build_int_cst (ntype, 0),
+			       rewrite_to_non_trapping_overflow (niter));
+	}
+      /* Don't try to obtain the iteration count expression when may_be_zero is
+	 integer_nonzerop (actually iteration count is one) or else.  */
+      else
+	return;
+    }
+
+  tree base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+			   build_int_cst (ntype, 1));
+  add_candidate (data, base, build_int_cst (ntype, -1), true, NULL, true);
+}
+
 /* Finds the candidates for the induction variables.  */
 
 static void
@@ -5398,6 +5534,10 @@ find_iv_candidates (struct ivopts_data *data)
   /* Add commonly used ivs.  */
   add_standard_iv_candidates (data);
 
+  /* Add doloop dedicate ivs.  */
+  if (data->doloop_use_p)
+    add_iv_candidate_for_doloop (data);
+
   /* Add old induction variables.  */
   add_iv_candidate_for_bivs (data);
 
@@ -5578,16 +5718,21 @@ determine_iv_cost (struct ivopts_data *data, struct iv_cand *cand)
      or a const set.  */
   if (cost_base.cost == 0)
     cost_base.cost = COSTS_N_INSNS (1);
-  cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
-
+  /* Doloop decrement should be considered as zero cost.  */
+  if (cand->doloop_p)
+    cost_step = 0;
+  else
+    cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
   cost = cost_step + adjust_setup_cost (data, cost_base.cost);
 
   /* Prefer the original ivs unless we may gain something by replacing it.
      The reason is to make debugging simpler; so this is not relevant for
      artificial ivs created by other optimization passes.  */
-  if (cand->pos != IP_ORIGINAL
-      || !SSA_NAME_VAR (cand->var_before)
-      || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+  if ((cand->pos != IP_ORIGINAL
+       || !SSA_NAME_VAR (cand->var_before)
+       || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+      /* Prefer doloop as well.  */
+      && !cand->doloop_p)
     cost++;
 
   /* Prefer not to insert statements into latch unless there are some
@@ -5633,10 +5778,14 @@ determine_iv_costs (struct ivopts_data *data)
 
 static unsigned
 ivopts_estimate_reg_pressure (struct ivopts_data *data, unsigned n_invs,
-			      unsigned n_cands)
+			      unsigned n_cands, unsigned n_doloop_cands)
 {
   unsigned cost;
-  unsigned n_old = data->regs_used, n_new = n_invs + n_cands;
+  unsigned n_old = data->regs_used, n_spr_for_doloop = 0;
+  /* If target supports count register for doloop, it doesn't take GPR.  */
+  if (targetm.have_count_reg_decr_p)
+    n_spr_for_doloop = n_doloop_cands;
+  unsigned n_new = n_invs + n_cands - n_spr_for_doloop;
   unsigned regs_needed = n_new + n_old, available_regs = target_avail_regs;
   bool speed = data->speed;
 
@@ -5666,7 +5815,7 @@ ivopts_estimate_reg_pressure (struct ivopts_data *data, unsigned n_invs,
 
   /* Finally, add the number of candidates, so that we prefer eliminating
      induction variables if possible.  */
-  return cost + n_cands;
+  return cost + n_cands - n_spr_for_doloop;
 }
 
 /* For each size of the induction variable set determine the penalty.  */
@@ -5727,7 +5876,7 @@ determine_set_costs (struct ivopts_data *data)
       fprintf (dump_file, "  ivs\tcost\n");
       for (j = 0; j <= 2 * target_avail_regs; j++)
 	fprintf (dump_file, "  %d\t%d\n", j,
-		 ivopts_estimate_reg_pressure (data, 0, j));
+		 ivopts_estimate_reg_pressure (data, 0, j, 0));
       fprintf (dump_file, "\n");
     }
 }
@@ -5786,7 +5935,8 @@ iv_ca_recount_cost (struct ivopts_data *data, struct iv_ca *ivs)
   comp_cost cost = ivs->cand_use_cost;
 
   cost += ivs->cand_cost;
-  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands);
+  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands,
+					ivs->n_doloop_cands);
   ivs->cost = cost;
 }
 
@@ -5833,6 +5983,8 @@ iv_ca_set_no_cp (struct ivopts_data *data, struct iv_ca *ivs,
     {
       bitmap_clear_bit (ivs->cands, cid);
       ivs->n_cands--;
+      if (cp->cand->doloop_p)
+	ivs->n_doloop_cands--;
       ivs->cand_cost -= cp->cand->cost;
       iv_ca_set_remove_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
       iv_ca_set_remove_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
@@ -5890,6 +6042,8 @@ iv_ca_set_cp (struct ivopts_data *data, struct iv_ca *ivs,
 	{
 	  bitmap_set_bit (ivs->cands, cid);
 	  ivs->n_cands++;
+	  if (cp->cand->doloop_p)
+	    ivs->n_doloop_cands++;
 	  ivs->cand_cost += cp->cand->cost;
 	  iv_ca_set_add_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
 	  iv_ca_set_add_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
@@ -6100,6 +6254,7 @@ iv_ca_new (struct ivopts_data *data)
   nw->n_cand_uses = XCNEWVEC (unsigned, data->vcands.length ());
   nw->cands = BITMAP_ALLOC (NULL);
   nw->n_cands = 0;
+  nw->n_doloop_cands = 0;
   nw->n_invs = 0;
   nw->cand_use_cost = no_cost;
   nw->cand_cost = 0;
@@ -6134,6 +6289,9 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs)
 
   fprintf (file, "  cost: %" PRId64 " (complexity %d)\n", cost.cost,
 	   cost.complexity);
+  fprintf (file, "  reg_cost: %d\n",
+	   ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands,
+					 ivs->n_doloop_cands));
   fprintf (file, "  cand_cost: %" PRId64 "\n  cand_group_cost: "
 	   "%" PRId64 " (complexity %d)\n", ivs->cand_cost,
 	   ivs->cand_use_cost.cost, ivs->cand_use_cost.complexity);
@@ -7568,6 +7726,47 @@ determine_scaling_factor (struct ivopts_data *data, basic_block *body)
     }
 }
 
+/* Find doloop comparison use and set its doloop_p on if found.  */
+
+static bool
+find_doloop_use (struct ivopts_data *data)
+{
+  struct loop *loop = data->current_loop;
+
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->type == USE_COMPARE)
+	{
+	  gcc_assert (group->vuses.length () == 1);
+	  struct iv_use *use = group->vuses[0];
+	  gimple *stmt = use->stmt;
+	  if (gimple_code (stmt) == GIMPLE_COND)
+	    {
+	      basic_block bb = gimple_bb (stmt);
+	      edge true_edge, false_edge;
+	      extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+	      /* This comparison is used for loop latch.  Require latch is empty
+		 for now.  */
+	      if ((loop->latch == true_edge->dest
+		   || loop->latch == false_edge->dest)
+		  && empty_block_p (loop->latch))
+		{
+		  group->doloop_p = true;
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file, "Doloop cmp iv use: ");
+		      print_gimple_stmt (dump_file, stmt, TDF_DETAILS);
+		    }
+		  return true;
+		}
+	    }
+	}
+    }
+
+  return false;
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
@@ -7580,6 +7779,7 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   basic_block *body;
 
   gcc_assert (!data->niters);
+  data->doloop_use_p = false;
   data->current_loop = loop;
   data->loop_loc = find_loop_location (loop).get_location_t ();
   data->speed = optimize_loop_for_speed_p (loop);
@@ -7622,6 +7822,22 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   /* Determine cost scaling factor for basic blocks in loop.  */
   determine_scaling_factor (data, body);
 
+  if (flag_branch_on_count_reg && generic_predict_doloop_p (data))
+    {
+      if (find_doloop_use (data))
+	{
+	  data->doloop_use_p = true;
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    {
+	      fprintf (dump_file,
+		       "Predict loop %d can perform"
+		       " doloop optimization later.\n",
+		       loop->num);
+	      flow_loop_dump (loop, dump_file, NULL, 1);
+	    }
+	}
+    }
+
   /* Finds candidates for the induction variables (item 2).  */
   find_iv_candidates (data);
 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-08-14  7:48         ` [PATCH v6 " Kewen.Lin
@ 2019-08-21 13:42           ` Bin.Cheng
  2019-08-22  7:09             ` Kewen.Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Bin.Cheng @ 2019-08-21 13:42 UTC (permalink / raw)
  To: Kewen.Lin
  Cc: gcc-patches List, Segher Boessenkool, Bill Schmidt, Richard Guenther

On Wed, Aug 14, 2019 at 3:23 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> Hi!
>
> Comparing to the previous versions of implementation mainly based on the
> existing IV cands but zeroing the related group/use cost, this new one is based
> on Richard and Segher's suggestion introducing one doloop dedicated IV cand.
>
> Some key points are listed below:
>   1) New field doloop_p in struct iv_cand to indicate doloop dedicated IV cand.
>   2) Special name "doloop" assigned.
>   3) Doloop IV cand with form (niter+1, +, -1)
>   4) For doloop IV cand, no extra one cost like BIV, assign zero cost for step.
>   5) Support may_be_zero (regressed PR is in this case), the base of doloop IV
>      can be COND_EXPR, add handlings in cand_value_at and may_eliminate_iv.
>   6) Add more expr support in force_expr_to_var_cost for reasonable cost
>      calculation on the IV base with may_be_zero (like COND_EXPR).
>   7) Set zero cost when using doloop IV cand for doloop use.
>   8) Add three hooks (should we merge _generic and _address?).
>     *) have_count_reg_decr_p, is to indicate the target has special hardware
>        count register, we shouldn't consider the impact of doloop IV when
>        calculating register pressures.
>     *) doloop_cost_for_generic, is the extra cost when using doloop IV cand for
>        generic type IV use.
>     *) doloop_cost_for_address, is the extra cost when using doloop IV cand for
>        address type IV use.
What will happen if the doloop IV cand is used for a generic/address type
iv use?  Can RTL doloop still perform the doloop optimization in this
case?

>
> Bootstrapped on powerpc64le-linux-gnu and regression testing passed excepting
> for one failure on gcc/testsuite/gcc.dg/guality/loop-1.c at -O3 which is tracked
> by PR89983.
>
> Any comments and suggestions are highly appreciated.  Thanks!
Not sure if I understand the patch correctly, some comments embedded.

+  /* The number of doloop candidate in the set.  */
+  unsigned n_doloop_cands;
+
This is unnecessary.  See below comments.

-    add_candidate_1 (data, base, step, important,
-                    IP_NORMAL, use, NULL, orig_iv);
+    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
+                    orig_iv);
   if (ip_end_pos (data->current_loop)
       && allow_ip_end_pos_p (data->current_loop))
-    add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
+    add_candidate_1 (data, base, step, important, IP_END, use, NULL, doloop,
+                    orig_iv);
Do we need to skip the ip_end_pos case for the doloop candidate?  Because
the candidate increment would be inserted in the latch, i.e., the
increment position is after the exit condition.

-  tree_to_aff_combination (iv->base, type, val);
+  tree base = iv->base;
+  /* See add_iv_candidate_for_doloop, if may_be_zero is set, we want to extract
+     the value under !may_be_zero to get the compact bound which also well fits
+     for may_be_zero since we ensure the value for it is const one.  */
+  if (cand->doloop_p && desc->may_be_zero && !integer_zerop
(desc->may_be_zero))
+    base = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
+                       unshare_expr (rewrite_to_non_trapping_overflow (niter)),
+                       build_int_cst (TREE_TYPE (niter), 1));
+  tree_to_aff_combination (base, type, val);
I don't quite follow here.  The iv->base is computed from niter; I
suppose the compact bound is for cheaper candidate initialization?  Why
is it possible to extract the !may_be_zero niter for may_be_zero here?
The niter under !may_be_zero gives no indication of the real niter under
may_be_zero.

-  cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
+  cand_value_at (loop, cand, use->stmt, desc, &bnd);
If I understand correctly, a doloop use/cand will only be
identified/added for a single-exit loop, and there will be only one
cond (doloop) iv_use and only one doloop cand for a doloop loop.  So the
cand_value at niter at the use position would be 0.  If that's the case,
we can skip calling cand_value_at here for the doloop cand.  The change
to cand_value_at would then be unnecessary as well.

-          expensive.  */
-  if (!integer_zerop (desc->may_be_zero))
+          expensive.
+
+     For doloop candidate, we have considered MAY_BE_ZERO for IV base, need to
+     support MAY_BE_ZERO ? 0 : NITER, so simply bypass this check.  */
+  if (!integer_zerop (desc->may_be_zero) && !cand->doloop_p)
     return iv_elimination_compare_lt (data, cand, comp, desc);
And we can early return before this?

+  if (may_be_zero)
+    {
+      if (COMPARISON_CLASS_P (may_be_zero))
+       {
+         niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
+                              build_int_cst (ntype, 0),
+                              rewrite_to_non_trapping_overflow (niter));
+       }
+      /* Don't try to obtain the iteration count expression when may_be_zero is
+        integer_nonzerop (actually iteration count is one) or else.  */
+      else
+       return;
+    }
+
+  tree base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+                          build_int_cst (ntype, 1));
niter is the number of latch executions, so niter + 1 could wrap here,
but I guess it's not a problem, since the similar issue is not handled
in the vectorizer either.

+  unsigned n_old = data->regs_used, n_spr_for_doloop = 0;
+  /* If target supports count register for doloop, it doesn't take GPR.  */
+  if (targetm.have_count_reg_decr_p)
+    n_spr_for_doloop = n_doloop_cands;
+  unsigned n_new = n_invs + n_cands - n_spr_for_doloop;
Not necessary.  See below.

-  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands);
+  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands,
+                                       ivs->n_doloop_cands);
Also.

       ivs->n_cands--;
+      if (cp->cand->doloop_p)
+       ivs->n_doloop_cands--;

          ivs->n_cands++;
+         if (cp->cand->doloop_p)
+           ivs->n_doloop_cands++;
You can just book n_cands under condition !cp->cand->doloop_p.

+  if (flag_branch_on_count_reg && generic_predict_doloop_p (data))
+    {
+      if (find_doloop_use (data))
+       {
+         data->doloop_use_p = true;
+         if (dump_file && (dump_flags & TDF_DETAILS))
+           {
+             fprintf (dump_file,
+                      "Predict loop %d can perform"
+                      " doloop optimization later.\n",
+                      loop->num);
+             flow_loop_dump (loop, dump_file, NULL, 1);
+           }
+       }
+    }
+
Please factor this into a function to keep caller short.

Thanks,
bin

>
> Kewen
>
> ---------
>
> gcc/ChangeLog
>
> 2019-08-14  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * config/rs6000/rs6000.c (TARGET_HAVE_COUNT_REG_DECR_P): New macro.
>         (TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
>         (TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
>         * target.def (have_count_reg_decr_p): New hook.
>         (doloop_cost_for_generic): Likewise.
>         (doloop_cost_for_address): Likewise.
>         * doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): Likewise.
>         (TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
>         (TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
>         * doc/tm.texi: Regenerate.
>         * tree-ssa-loop-ivopts.c (comp_cost::operator+=): Consider infinite cost
>         addend.
>         (record_group): Init doloop_p.
>         (add_candidate_1): Add optional argument doloop, change the handlings
>         accordingly.
>         (add_candidate): Likewise.
>         (add_iv_candidate_for_biv): Update the call to add_candidate.
>         (generic_predict_doloop_p): Update attribute.
>         (force_expr_to_var_cost): Add costing for expressions COND_EXPR/LT_EXPR/
>         LE_EXPR/GT_EXPR/GE_EXPR/EQ_EXPR/NE_EXPR/UNORDERED_EXPR/ORDERED_EXPR/
>         UNLT_EXPR/UNLE_EXPR/UNGT_EXPR/UNGE_EXPR/UNEQ_EXPR/LTGT_EXPR/MAX_EXPR/
>         MIN_EXPR.
>         (determine_group_iv_cost_generic): Update for doloop IV cand.
>         (determine_group_iv_cost_address): Likewise.
>         (determine_group_iv_cost_cond): Likewise.
>         (determine_iv_cost): Likewise.
>         (ivopts_estimate_reg_pressure): Likewise.
>         (cand_value_at): Update argument niter type to struct tree_niter_desc*,
>         consider doloop IV cand and may_be_zero.
>         (may_eliminate_iv): Update the call to cand_value_at, consider doloop
>         IV cand and may_be_zero.
>         (add_iv_candidate_for_doloop): New function.
>         (find_iv_candidates): Call function add_iv_candidate_for_doloop.
>         (determine_set_costs): Update the call to ivopts_estimate_reg_pressure.
>         (iv_ca_recount_cost): Likewise.
>         (iv_ca_new): Init n_doloop_cands.
>         (iv_ca_set_no_cp): Update n_doloop_cands.
>         (iv_ca_set_cp): Likewise.
>         (iv_ca_dump): Dump register cost.
>         (find_doloop_use): Likewise.
>         (tree_ssa_iv_optimize_loop): Call function generic_predict_doloop_p and
>         find_doloop_use.
>
> gcc/testsuite/ChangeLog
>
> 2019-08-14  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * gcc.dg/tree-ssa/ivopts-3.c: Adjust for doloop change.
>         * gcc.dg/tree-ssa/ivopts-lt.c: Likewise.
>         * gcc.dg/tree-ssa/pr32044.c: Likewise.
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-08-21 13:42           ` Bin.Cheng
@ 2019-08-22  7:09             ` Kewen.Lin
  2019-08-22  8:07               ` Bin.Cheng
  0 siblings, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-08-22  7:09 UTC (permalink / raw)
  To: Bin.Cheng
  Cc: gcc-patches List, Segher Boessenkool, Bill Schmidt, Richard Guenther

Hi Bin,

Thanks for your time!

on 2019/8/21 下午8:32, Bin.Cheng wrote:
> On Wed, Aug 14, 2019 at 3:23 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> Hi!
>>
>> Comparing to the previous versions of implementation mainly based on the
>> existing IV cands but zeroing the related group/use cost, this new one is based
>> on Richard and Segher's suggestion introducing one doloop dedicated IV cand.
>>
>> Some key points are listed below:
>>   1) New field doloop_p in struct iv_cand to indicate doloop dedicated IV cand.
>>   2) Special name "doloop" assigned.
>>   3) Doloop IV cand with form (niter+1, +, -1)
>>   4) For doloop IV cand, no extra one cost like BIV, assign zero cost for step.
>>   5) Support may_be_zero (regressed PR is in this case), the base of doloop IV
>>      can be COND_EXPR, add handlings in cand_value_at and may_eliminate_iv.
>>   6) Add more expr support in force_expr_to_var_cost for reasonable cost
>>      calculation on the IV base with may_be_zero (like COND_EXPR).
>>   7) Set zero cost when using doloop IV cand for doloop use.
>>   8) Add three hooks (should we merge _generic and _address?).
>>     *) have_count_reg_decr_p, is to indicate the target has special hardware
>>        count register, we shouldn't consider the impact of doloop IV when
>>        calculating register pressures.
>>     *) doloop_cost_for_generic, is the extra cost when using doloop IV cand for
>>        generic type IV use.
>>     *) doloop_cost_for_address, is the extra cost when using doloop IV cand for
>>        address type IV use.
> What will happen if doloop IV cand be used for generic/address type iv
> use?  Can RTL doloop can still perform doloop optimization in this
> case?
> 

On Power, we put the iteration count into the hardware count register, and it is
very costly to move the count to a GPR, so the cost is set to INF to make it
impossible to use it for generic/address type iv uses.  But as discussed before,
some targets that use a GPR instead of a hardware count register probably want to
use this doloop iv for other uses too if profitable.  These two hooks offer that
possibility.  In that case, I think RTL doloop can still perform the optimization
since it can still recognize the pattern and transform it.  The generic/address
uses can still use the iv.
>>
>> Bootstrapped on powerpc64le-linux-gnu and regression testing passed excepting
>> for one failure on gcc/testsuite/gcc.dg/guality/loop-1.c at -O3 which is tracked
>> by PR89983.
>>
>> Any comments and suggestions are highly appreciated.  Thanks!
> Not sure if I understand the patch correctly, some comments embedded.
> 
> +  /* The number of doloop candidate in the set.  */
> +  unsigned n_doloop_cands;
> +
> This is unnecessary.  See below comments.
> 
> -    add_candidate_1 (data, base, step, important,
> -                    IP_NORMAL, use, NULL, orig_iv);
> +    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
> +                    orig_iv);
>    if (ip_end_pos (data->current_loop)
>        && allow_ip_end_pos_p (data->current_loop))
> -    add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
> +    add_candidate_1 (data, base, step, important, IP_END, use, NULL, doloop,
> +                    orig_iv);
> Do we need to skip ip_end_pos case for doloop candidate?  Because the
> candidate increment will be inserted in latch, i.e, increment position
> is after exit condition.
> 

Yes, we should skip it.  Currently function find_doloop_use has the check on an
empty latch and gimple_cond to latch, partially excluding it.  But it's still good
to guard it directly here.

> -  tree_to_aff_combination (iv->base, type, val);
> +  tree base = iv->base;
> +  /* See add_iv_candidate_for_doloop, if may_be_zero is set, we want to extract
> +     the value under !may_be_zero to get the compact bound which also well fits
> +     for may_be_zero since we ensure the value for it is const one.  */
> +  if (cand->doloop_p && desc->may_be_zero && !integer_zerop
> (desc->may_be_zero))
> +    base = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
> +                       unshare_expr (rewrite_to_non_trapping_overflow (niter)),
> +                       build_int_cst (TREE_TYPE (niter), 1));
> +  tree_to_aff_combination (base, type, val);
> I don't quite follow here.  The iv->base is computed from niter, I
> suppose compact bound is for cheaper candidate initialization?  Why
> it's possible to extract !may_be_zero niter for may_be_zero here?  The
> niter under !may_be_zero has no indication about the real niter under
> may_be_zero.
> 

As you note below, the cand_value for doloop would be zero.  But when
may_be_zero is set, the current calculation would process the whole niter
expression, including the COND_EXPR introduced by the may_be_zero check, which
is not what we want.  The purpose is to use the value under the condition
!may_be_zero for the calculation and, yes, to finally get the expected zero.

> -  cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
> +  cand_value_at (loop, cand, use->stmt, desc, &bnd);
> If I understand correctly, doloop use/cand will only be
> identified/added for single exit loop, and there will be only one
> cond(doloop) iv_use and only one doloop cand for doloop loop.  So the
> cand_value at niter at use position would be 0.  If that's the case,
> we can skip calling cand_value_at here for doloop cand.  The change to
> cand_value_at would be unnecessary neither.
> 

Exactly, I'll add the early return with zero bound for doloop.

> -          expensive.  */
> -  if (!integer_zerop (desc->may_be_zero))
> +          expensive.
> +
> +     For doloop candidate, we have considered MAY_BE_ZERO for IV base, need to
> +     support MAY_BE_ZERO ? 0 : NITER, so simply bypass this check.  */
> +  if (!integer_zerop (desc->may_be_zero) && !cand->doloop_p)
>      return iv_elimination_compare_lt (data, cand, comp, desc);
> And we can early return before this?
> 

OK.

> +  if (may_be_zero)
> +    {
> +      if (COMPARISON_CLASS_P (may_be_zero))
> +       {
> +         niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
> +                              build_int_cst (ntype, 0),
> +                              rewrite_to_non_trapping_overflow (niter));
> +       }
> +      /* Don't try to obtain the iteration count expression when may_be_zero is
> +        integer_nonzerop (actually iteration count is one) or else.  */
> +      else
> +       return;
> +    }
> +
> +  tree base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
> +                          build_int_cst (ntype, 1));
> niter is the number of latch executions, so niter + 1 could wrap here,
> but guess it's not a problem the similar issue is not handled in
> vectorizer neither.
> 

OK.

> +  unsigned n_old = data->regs_used, n_spr_for_doloop = 0;
> +  /* If target supports count register for doloop, it doesn't take GPR.  */
> +  if (targetm.have_count_reg_decr_p)
> +    n_spr_for_doloop = n_doloop_cands;
> +  unsigned n_new = n_invs + n_cands - n_spr_for_doloop;
> Not necessary.  See below.

> -  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands);
> +  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands,
> +                                       ivs->n_doloop_cands);
> Also.
> 
>        ivs->n_cands--;
> +      if (cp->cand->doloop_p)
> +       ivs->n_doloop_cands--;
> 
>           ivs->n_cands++;
> +         if (cp->cand->doloop_p)
> +           ivs->n_doloop_cands++;
> You can just book n_cands under condition !cp->cand->doloop_p.

If my understanding is correct, you are suggesting the code like:

if (!cp->cand->doloop_p)
  ivs->n_cands++;

But I'm afraid that it can NOT satisfy the need in function
ivopts_estimate_reg_pressure.  As the comment there says, "If target
supports count register for doloop, it doesn't take GPR."  If we make the
doloop cand invisible in n_cands, that's fine for targets with a count
register, but we would fail to count it on targets without one.

> 
> +  if (flag_branch_on_count_reg && generic_predict_doloop_p (data))
> +    {
> +      if (find_doloop_use (data))
> +       {
> +         data->doloop_use_p = true;
> +         if (dump_file && (dump_flags & TDF_DETAILS))
> +           {
> +             fprintf (dump_file,
> +                      "Predict loop %d can perform"
> +                      " doloop optimization later.\n",
> +                      loop->num);
> +             flow_loop_dump (loop, dump_file, NULL, 1);
> +           }
> +       }
> +    }
> +
> Please factor this into a function to keep caller short.
> 

OK.


Thanks!
Kewen

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-08-22  7:09             ` Kewen.Lin
@ 2019-08-22  8:07               ` Bin.Cheng
  2019-08-22  9:16                 ` Kewen.Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Bin.Cheng @ 2019-08-22  8:07 UTC (permalink / raw)
  To: Kewen.Lin
  Cc: gcc-patches List, Segher Boessenkool, Bill Schmidt, Richard Guenther

On Thu, Aug 22, 2019 at 11:18 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> Hi Bin,
>
> Thanks for your time!
>
> on 2019/8/21 下午8:32, Bin.Cheng wrote:
> > On Wed, Aug 14, 2019 at 3:23 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
> >>
> >> Hi!
> >>
> >> Comparing to the previous versions of implementation mainly based on the
> >> existing IV cands but zeroing the related group/use cost, this new one is based
> >> on Richard and Segher's suggestion introducing one doloop dedicated IV cand.
> >>
> >> Some key points are listed below:
> >>   1) New field doloop_p in struct iv_cand to indicate doloop dedicated IV cand.
> >>   2) Special name "doloop" assigned.
> >>   3) Doloop IV cand with form (niter+1, +, -1)
> >>   4) For doloop IV cand, no extra one cost like BIV, assign zero cost for step.
> >>   5) Support may_be_zero (regressed PR is in this case), the base of doloop IV
> >>      can be COND_EXPR, add handlings in cand_value_at and may_eliminate_iv.
> >>   6) Add more expr support in force_expr_to_var_cost for reasonable cost
> >>      calculation on the IV base with may_be_zero (like COND_EXPR).
> >>   7) Set zero cost when using doloop IV cand for doloop use.
> >>   8) Add three hooks (should we merge _generic and _address?).
> >>     *) have_count_reg_decr_p, is to indicate the target has special hardware
> >>        count register, we shouldn't consider the impact of doloop IV when
> >>        calculating register pressures.
> >>     *) doloop_cost_for_generic, is the extra cost when using doloop IV cand for
> >>        generic type IV use.
> >>     *) doloop_cost_for_address, is the extra cost when using doloop IV cand for
> >>        address type IV use.
> > What will happen if doloop IV cand be used for generic/address type iv
> > use?  Can RTL doloop can still perform doloop optimization in this
> > case?
> >
>
> On Power, we put the iteration count into hardware count register, it takes very
> high cost to move the count to GPR, so the cost is set as INF to make it impossible
> to use it for generic/address type iv use.  But as some discussion before, on some
> targets using GPR instead of hardware count register, they probably want to use this
> doloop iv used for other uses if profitable.  These two hooks offer the possibility.
> In that case, I think RTL doloop can still perform since it can still get the
> pattern and transform.  The generic/address uses can still use it.
> >>
> >> Bootstrapped on powerpc64le-linux-gnu and regression testing passed excepting
> >> for one failure on gcc/testsuite/gcc.dg/guality/loop-1.c at -O3 which is tracked
> >> by PR89983.
> >>
> >> Any comments and suggestions are highly appreciated.  Thanks!
> > Not sure if I understand the patch correctly, some comments embedded.
> >
> > +  /* The number of doloop candidate in the set.  */
> > +  unsigned n_doloop_cands;
> > +
> > This is unnecessary.  See below comments.
> >
> > -    add_candidate_1 (data, base, step, important,
> > -                    IP_NORMAL, use, NULL, orig_iv);
> > +    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
> > +                    orig_iv);
> >    if (ip_end_pos (data->current_loop)
> >        && allow_ip_end_pos_p (data->current_loop))
> > -    add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
> > +    add_candidate_1 (data, base, step, important, IP_END, use, NULL, doloop,
> > +                    orig_iv);
> > Do we need to skip ip_end_pos case for doloop candidate?  Because the
> > candidate increment will be inserted in latch, i.e, increment position
> > is after exit condition.
> >
>
> Yes, we should skip it.  Currently function find_doloop_use has the check on an
> empty latch and gimple_cond to latch, partially excluding it.  But it's still good
> to guard it directly here.
>
> > -  tree_to_aff_combination (iv->base, type, val);
> > +  tree base = iv->base;
> > +  /* See add_iv_candidate_for_doloop, if may_be_zero is set, we want to extract
> > +     the value under !may_be_zero to get the compact bound which also well fits
> > +     for may_be_zero since we ensure the value for it is const one.  */
> > +  if (cand->doloop_p && desc->may_be_zero && !integer_zerop
> > (desc->may_be_zero))
> > +    base = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
> > +                       unshare_expr (rewrite_to_non_trapping_overflow (niter)),
> > +                       build_int_cst (TREE_TYPE (niter), 1));
> > +  tree_to_aff_combination (base, type, val);
> > I don't quite follow here.  The iv->base is computed from niter, I
> > suppose compact bound is for cheaper candidate initialization?  Why
> > it's possible to extract !may_be_zero niter for may_be_zero here?  The
> > niter under !may_be_zero has no indication about the real niter under
> > may_be_zero.
> >
>
> As you note below, the cand_value for doloop would be zero, but for the case
> may_be_zero set, the current calculation would take care of the whole niter
> expression including the cond_expr introduced by may_be_zero check, it's
> unexpected.  The purpose is to use the value under condition !may_be_zero
> for the calculation, and yes, to get expected zero finally.
>
> > -  cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
> > +  cand_value_at (loop, cand, use->stmt, desc, &bnd);
> > If I understand correctly, doloop use/cand will only be
> > identified/added for single exit loop, and there will be only one
> > cond(doloop) iv_use and only one doloop cand for doloop loop.  So the
> > cand_value at niter at use position would be 0.  If that's the case,
> > we can skip calling cand_value_at here for doloop cand.  The change to
> > cand_value_at would be unnecessary neither.
> >
>
> Exactly, I'll add the early return with zero bound for doloop.
>
> > -          expensive.  */
> > -  if (!integer_zerop (desc->may_be_zero))
> > +          expensive.
> > +
> > +     For doloop candidate, we have considered MAY_BE_ZERO for IV base, need to
> > +     support MAY_BE_ZERO ? 0 : NITER, so simply bypass this check.  */
> > +  if (!integer_zerop (desc->may_be_zero) && !cand->doloop_p)
> >      return iv_elimination_compare_lt (data, cand, comp, desc);
> > And we can early return before this?
> >
>
> OK.
>
> > +  if (may_be_zero)
> > +    {
> > +      if (COMPARISON_CLASS_P (may_be_zero))
> > +       {
> > +         niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
> > +                              build_int_cst (ntype, 0),
> > +                              rewrite_to_non_trapping_overflow (niter));
> > +       }
> > +      /* Don't try to obtain the iteration count expression when may_be_zero is
> > +        integer_nonzerop (actually iteration count is one) or else.  */
> > +      else
> > +       return;
> > +    }
> > +
> > +  tree base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
> > +                          build_int_cst (ntype, 1));
> > niter is the number of latch executions, so niter + 1 could wrap here,
> > but guess it's not a problem the similar issue is not handled in
> > vectorizer neither.
> >
>
> OK.
>
> > +  unsigned n_old = data->regs_used, n_spr_for_doloop = 0;
> > +  /* If target supports count register for doloop, it doesn't take GPR.  */
> > +  if (targetm.have_count_reg_decr_p)
> > +    n_spr_for_doloop = n_doloop_cands;
> > +  unsigned n_new = n_invs + n_cands - n_spr_for_doloop;
> > Not necessary.  See below.
>
> > -  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands);
> > +  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands,
> > +                                       ivs->n_doloop_cands);
> > Also.
> >
> >        ivs->n_cands--;
> > +      if (cp->cand->doloop_p)
> > +       ivs->n_doloop_cands--;
> >
> >           ivs->n_cands++;
> > +         if (cp->cand->doloop_p)
> > +           ivs->n_doloop_cands++;
> > You can just book n_cands under condition !cp->cand->doloop_p.
>
> If my understanding is correct, you are suggesting the code like:
>
> if (!cp->cand->doloop_p)
>   ivs->n_cands++;
>
> But I'm afraid that it can NOT satisfy the need in function
> ivopts_estimate_reg_pressure.  As the comments, "if target supports
> count register for doloop it doesn't take GPR.".  If we make doloop
> cand invisible in n_cands, it's fine for target with count register,
> but we may miss to count them on targets without count register.
Why not go one step further and check both conditions:
if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
  ivs->n_cands++;

Thanks,
bin
>
> >
> > +  if (flag_branch_on_count_reg && generic_predict_doloop_p (data))
> > +    {
> > +      if (find_doloop_use (data))
> > +       {
> > +         data->doloop_use_p = true;
> > +         if (dump_file && (dump_flags & TDF_DETAILS))
> > +           {
> > +             fprintf (dump_file,
> > +                      "Predict loop %d can perform"
> > +                      " doloop optimization later.\n",
> > +                      loop->num);
> > +             flow_loop_dump (loop, dump_file, NULL, 1);
> > +           }
> > +       }
> > +    }
> > +
> > Please factor this into a function to keep caller short.
> >
>
> OK.
>
>
> Thanks!
> Kewen
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-08-22  8:07               ` Bin.Cheng
@ 2019-08-22  9:16                 ` Kewen.Lin
  2019-08-23  5:31                   ` Bin.Cheng
  0 siblings, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-08-22  9:16 UTC (permalink / raw)
  To: Bin.Cheng
  Cc: gcc-patches List, Segher Boessenkool, Bill Schmidt, Richard Guenther

[-- Attachment #1: Type: text/plain, Size: 10743 bytes --]

Hi Bin,

on 2019/8/22 下午1:46, Bin.Cheng wrote:
> On Thu, Aug 22, 2019 at 11:18 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> Hi Bin,
>>
>> Thanks for your time!
>>
>> on 2019/8/21 下午8:32, Bin.Cheng wrote:
>>> On Wed, Aug 14, 2019 at 3:23 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>>>
>>>> Hi!
>>>>
>>>> Comparing to the previous versions of implementation mainly based on the
>>>> existing IV cands but zeroing the related group/use cost, this new one is based
>>>> on Richard and Segher's suggestion introducing one doloop dedicated IV cand.
>>>>
>>>> Some key points are listed below:
>>>>   1) New field doloop_p in struct iv_cand to indicate doloop dedicated IV cand.
>>>>   2) Special name "doloop" assigned.
>>>>   3) Doloop IV cand with form (niter+1, +, -1)
>>>>   4) For doloop IV cand, no extra one cost like BIV, assign zero cost for step.
>>>>   5) Support may_be_zero (regressed PR is in this case), the base of doloop IV
>>>>      can be COND_EXPR, add handlings in cand_value_at and may_eliminate_iv.
>>>>   6) Add more expr support in force_expr_to_var_cost for reasonable cost
>>>>      calculation on the IV base with may_be_zero (like COND_EXPR).
>>>>   7) Set zero cost when using doloop IV cand for doloop use.
>>>>   8) Add three hooks (should we merge _generic and _address?).
>>>>     *) have_count_reg_decr_p, is to indicate the target has special hardware
>>>>        count register, we shouldn't consider the impact of doloop IV when
>>>>        calculating register pressures.
>>>>     *) doloop_cost_for_generic, is the extra cost when using doloop IV cand for
>>>>        generic type IV use.
>>>>     *) doloop_cost_for_address, is the extra cost when using doloop IV cand for
>>>>        address type IV use.
>>> What will happen if doloop IV cand be used for generic/address type iv
>>> use?  Can RTL doloop can still perform doloop optimization in this
>>> case?
>>>
>>
>> On Power, we put the iteration count into hardware count register, it takes very
>> high cost to move the count to GPR, so the cost is set as INF to make it impossible
>> to use it for generic/address type iv use.  But as some discussion before, on some
>> targets using GPR instead of hardware count register, they probably want to use this
>> doloop iv used for other uses if profitable.  These two hooks offer the possibility.
>> In that case, I think RTL doloop can still perform since it can still get the
>> pattern and transform.  The generic/address uses can still use it.
>>>>
>>>> Bootstrapped on powerpc64le-linux-gnu and regression testing passed excepting
>>>> for one failure on gcc/testsuite/gcc.dg/guality/loop-1.c at -O3 which is tracked
>>>> by PR89983.
>>>>
>>>> Any comments and suggestions are highly appreciated.  Thanks!
>>> Not sure if I understand the patch correctly, some comments embedded.
>>>
>>> +  /* The number of doloop candidate in the set.  */
>>> +  unsigned n_doloop_cands;
>>> +
>>> This is unnecessary.  See below comments.
>>>
>>> -    add_candidate_1 (data, base, step, important,
>>> -                    IP_NORMAL, use, NULL, orig_iv);
>>> +    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
>>> +                    orig_iv);
>>>    if (ip_end_pos (data->current_loop)
>>>        && allow_ip_end_pos_p (data->current_loop))
>>> -    add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
>>> +    add_candidate_1 (data, base, step, important, IP_END, use, NULL, doloop,
>>> +                    orig_iv);
>>> Do we need to skip ip_end_pos case for doloop candidate?  Because the
>>> candidate increment will be inserted in latch, i.e, increment position
>>> is after exit condition.
>>>
>>
>> Yes, we should skip it.  Currently function find_doloop_use has the check on an
>> empty latch and gimple_cond to latch, partially excluding it.  But it's still good
>> to guard it directly here.
>>
>>> -  tree_to_aff_combination (iv->base, type, val);
>>> +  tree base = iv->base;
>>> +  /* See add_iv_candidate_for_doloop, if may_be_zero is set, we want to extract
>>> +     the value under !may_be_zero to get the compact bound which also well fits
>>> +     for may_be_zero since we ensure the value for it is const one.  */
>>> +  if (cand->doloop_p && desc->may_be_zero && !integer_zerop
>>> (desc->may_be_zero))
>>> +    base = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
>>> +                       unshare_expr (rewrite_to_non_trapping_overflow (niter)),
>>> +                       build_int_cst (TREE_TYPE (niter), 1));
>>> +  tree_to_aff_combination (base, type, val);
>>> I don't quite follow here.  The iv->base is computed from niter, I
>>> suppose compact bound is for cheaper candidate initialization?  Why
>>> it's possible to extract !may_be_zero niter for may_be_zero here?  The
>>> niter under !may_be_zero has no indication about the real niter under
>>> may_be_zero.
>>>
>>
>> As you note below, the cand_value for doloop would be zero, but for the case
>> may_be_zero set, the current calculation would take care of the whole niter
>> expression including the cond_expr introduced by may_be_zero check, it's
>> unexpected.  The purpose is to use the value under condition !may_be_zero
>> for the calculation, and yes, to get expected zero finally.
>>
>>> -  cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
>>> +  cand_value_at (loop, cand, use->stmt, desc, &bnd);
>>> If I understand correctly, doloop use/cand will only be
>>> identified/added for single exit loop, and there will be only one
>>> cond(doloop) iv_use and only one doloop cand for doloop loop.  So the
>>> cand_value at niter at use position would be 0.  If that's the case,
>>> we can skip calling cand_value_at here for doloop cand.  The change to
>>> cand_value_at would be unnecessary neither.
>>>
>>
>> Exactly, I'll add the early return with zero bound for doloop.
>>
>>> -          expensive.  */
>>> -  if (!integer_zerop (desc->may_be_zero))
>>> +          expensive.
>>> +
>>> +     For doloop candidate, we have considered MAY_BE_ZERO for IV base, need to
>>> +     support MAY_BE_ZERO ? 0 : NITER, so simply bypass this check.  */
>>> +  if (!integer_zerop (desc->may_be_zero) && !cand->doloop_p)
>>>      return iv_elimination_compare_lt (data, cand, comp, desc);
>>> And we can early return before this?
>>>
>>
>> OK.
>>
>>> +  if (may_be_zero)
>>> +    {
>>> +      if (COMPARISON_CLASS_P (may_be_zero))
>>> +       {
>>> +         niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
>>> +                              build_int_cst (ntype, 0),
>>> +                              rewrite_to_non_trapping_overflow (niter));
>>> +       }
>>> +      /* Don't try to obtain the iteration count expression when may_be_zero is
>>> +        integer_nonzerop (actually iteration count is one) or else.  */
>>> +      else
>>> +       return;
>>> +    }
>>> +
>>> +  tree base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
>>> +                          build_int_cst (ntype, 1));
>>> niter is the number of latch executions, so niter + 1 could wrap here,
>>> but guess it's not a problem the similar issue is not handled in
>>> vectorizer neither.
>>>
>>
>> OK.
>>
>>> +  unsigned n_old = data->regs_used, n_spr_for_doloop = 0;
>>> +  /* If target supports count register for doloop, it doesn't take GPR.  */
>>> +  if (targetm.have_count_reg_decr_p)
>>> +    n_spr_for_doloop = n_doloop_cands;
>>> +  unsigned n_new = n_invs + n_cands - n_spr_for_doloop;
>>> Not necessary.  See below.
>>
>>> -  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands);
>>> +  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands,
>>> +                                       ivs->n_doloop_cands);
>>> Also.
>>>
>>>        ivs->n_cands--;
>>> +      if (cp->cand->doloop_p)
>>> +       ivs->n_doloop_cands--;
>>>
>>>           ivs->n_cands++;
>>> +         if (cp->cand->doloop_p)
>>> +           ivs->n_doloop_cands++;
>>> You can just book n_cands under condition !cp->cand->doloop_p.
>>
>> If my understanding is correct, you are suggesting the code like:
>>
>> if (!cp->cand->doloop_p)
>>   ivs->n_cands++;
>>
>> But I'm afraid that it can NOT satisfy the need in function
>> ivopts_estimate_reg_pressure.  As the comments, "if target supports
>> count register for doloop it doesn't take GPR.".  If we make doloop
>> cand invisible in n_cands, it's fine for target with count register,
>> but we may miss to count them on targets without count register.
> Why not one more step do checks:
> if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
>   ivs->n_cands++;
> 

Yes, it works.  Thanks!

The new patch addressing the comments is attached.
Could you please take another look?  Thanks in advance!


Kewen

---------

gcc/ChangeLog

2019-08-22  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* config/rs6000/rs6000.c (TARGET_HAVE_COUNT_REG_DECR_P): New macro.
	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
	* target.def (have_count_reg_decr_p): New hook.
	(doloop_cost_for_generic): Likewise.
	(doloop_cost_for_address): Likewise.
	* doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): Likewise.
	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
	* doc/tm.texi: Regenerate.
	* tree-ssa-loop-ivopts.c (comp_cost::operator+=): Consider infinite cost
	addend.
	(record_group): Init doloop_p.
	(add_candidate_1): Add optional argument doloop, change the handlings
	accordingly.
	(add_candidate): Likewise.
	(add_iv_candidate_for_biv): Update the call to add_candidate.
	(generic_predict_doloop_p): Update attribute.
	(force_expr_to_var_cost): Add costing for expressions COND_EXPR/LT_EXPR/
	LE_EXPR/GT_EXPR/GE_EXPR/EQ_EXPR/NE_EXPR/UNORDERED_EXPR/ORDERED_EXPR/
	UNLT_EXPR/UNLE_EXPR/UNGT_EXPR/UNGE_EXPR/UNEQ_EXPR/LTGT_EXPR/MAX_EXPR/
	MIN_EXPR.
	(determine_group_iv_cost_generic): Update for doloop IV cand.
	(determine_group_iv_cost_address): Likewise.
	(determine_group_iv_cost_cond): Likewise.
	(determine_iv_cost): Likewise.
	(ivopts_estimate_reg_pressure): Likewise.
	(may_eliminate_iv): Likewise.
	(add_iv_candidate_for_doloop): New function.
	(find_iv_candidates): Call function add_iv_candidate_for_doloop.
	(iv_ca_set_no_cp): Update for doloop IV cand.
	(iv_ca_set_cp): Likewise.
	(iv_ca_dump): Dump register cost.
	(find_doloop_use): New function.
	(predict_and_process_doloop): Likewise.
	(tree_ssa_iv_optimize_loop): Call function predict_and_process_doloop.

gcc/testsuite/ChangeLog

2019-08-22  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* gcc.dg/tree-ssa/ivopts-3.c: Adjust for doloop change.
	* gcc.dg/tree-ssa/ivopts-lt.c: Likewise.
	* gcc.dg/tree-ssa/pr32044.c: Likewise.


[-- Attachment #2: doloop_dedicated_iv2.diff --]
[-- Type: text/plain, Size: 21639 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 6667cd0..5eccbdc 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1912,6 +1912,16 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_PREDICT_DOLOOP_P
 #define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
 
+#undef TARGET_HAVE_COUNT_REG_DECR_P
+#define TARGET_HAVE_COUNT_REG_DECR_P true
+
+/* 1000000000 is infinite cost in IVOPTs.  */
+#undef TARGET_DOLOOP_COST_FOR_GENERIC
+#define TARGET_DOLOOP_COST_FOR_GENERIC 1000000000
+
+#undef TARGET_DOLOOP_COST_FOR_ADDRESS
+#define TARGET_DOLOOP_COST_FOR_ADDRESS 1000000000
+
 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index c2aa4d0..9f3a08a 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11618,6 +11618,29 @@ loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
+@deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
+Return true if the target supports hardware count register for decrement
+and branch.  This count register can't be used as general register since
+moving to/from a general register from/to it is very expensive.
+The default value is false.
+@end deftypevr
+
+@deftypevr {Target Hook} int64_t TARGET_DOLOOP_COST_FOR_GENERIC
+IVOPTs introduces one doloop dedicated IV candidate, this hook offers
+ target owner a way to adjust cost when selecting doloop IV candidate for a
+ generic IV use.  At calcuation, this value will be added on normal cost
+ already calculated by current implementation.
+The default value is zero.
+@end deftypevr
+
+@deftypevr {Target Hook} int64_t TARGET_DOLOOP_COST_FOR_ADDRESS
+IVOPTs introduces one doloop dedicated IV candidate, this hook offers
+ target owner a way to adjust cost when selecting doloop IV candidate for an
+ address IV use.  At calcuation, this value will be added on normal cost
+ already calculated by current implementation.
+The default value is zero.
+@end deftypevr
+
 @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
 Return true if it is possible to use low-overhead loops (@code{doloop_end}
 and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index b4d57b8..4346773 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7946,6 +7946,12 @@ to by @var{ce_info}.
 
 @hook TARGET_PREDICT_DOLOOP_P
 
+@hook TARGET_HAVE_COUNT_REG_DECR_P
+
+@hook TARGET_DOLOOP_COST_FOR_GENERIC
+
+@hook TARGET_DOLOOP_COST_FOR_ADDRESS
+
 @hook TARGET_CAN_USE_DOLOOP_P
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
diff --git a/gcc/target.def b/gcc/target.def
index 71b6972..69e2844 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4246,6 +4246,32 @@ The default version of this hook returns false.",
  bool, (struct loop *loop),
  default_predict_doloop_p)
 
+DEFHOOKPOD
+(have_count_reg_decr_p,
+ "Return true if the target supports hardware count register for decrement\n\
+and branch.  This count register can't be used as general register since\n\
+moving to/from a general register from/to it is very expensive.\n\
+The default value is false.",
+ bool, false)
+
+DEFHOOKPOD
+(doloop_cost_for_generic,
+ "IVOPTs introduces one doloop dedicated IV candidate, this hook offers\n\
+ target owner a way to adjust cost when selecting doloop IV candidate for a\n\
+ generic IV use.  At calcuation, this value will be added on normal cost\n\
+ already calculated by current implementation.\n\
+The default value is zero.",
+ int64_t, 0)
+
+DEFHOOKPOD
+(doloop_cost_for_address,
+ "IVOPTs introduces one doloop dedicated IV candidate, this hook offers\n\
+ target owner a way to adjust cost when selecting doloop IV candidate for an\n\
+ address IV use.  At calcuation, this value will be added on normal cost\n\
+ already calculated by current implementation.\n\
+The default value is zero.",
+ int64_t, 0)
+
 DEFHOOK
 (can_use_doloop_p,
  "Return true if it is possible to use low-overhead loops (@code{doloop_end}\n\
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
index 214e6a7..ce4b1d0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
@@ -10,4 +10,6 @@ int main (void)
     f2 ();
 }
 
-/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" } }  */
+/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* More debug information emitted for doloop on powerpc.  */
+/* { dg-final { scan-tree-dump-times "!= 0" 6 "ivopts" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 7d5859b..71d7f67 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n)
   while (i < n);
 }
 
-/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
-/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
-/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
+/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c b/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
index 8a8977a..06c27b0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
@@ -1,6 +1,10 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -fdump-tree-optimized" } */
 
+/* For powerpc, disable doloop IV cand generation in IVOPTs to avoid unexpected
+   division operation for its base setup.  */
+/* { dg-additional-options "-fno-branch-count-reg" { target { powerpc*-*-* } } } */
+
 int foo (int n)
 {
   while (n >= 45)
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 530ea4a..be3b0b5 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -275,6 +275,9 @@ comp_cost::operator+= (comp_cost cost)
 comp_cost
 comp_cost::operator+= (HOST_WIDE_INT c)
 {
+  if (c >= INFTY)
+    this->cost = INFTY;
+
   if (infinite_cost_p ())
     return *this;
 
@@ -399,6 +402,8 @@ struct iv_group
   struct cost_pair *cost_map;
   /* The selected candidate for the group.  */
   struct iv_cand *selected;
+  /* To indicate this is a doloop use group.  */
+  bool doloop_p;
   /* Uses in the group.  */
   vec<struct iv_use *> vuses;
 };
@@ -439,6 +444,7 @@ struct iv_cand
 			   be hoisted out of loop.  */
   struct iv *orig_iv;	/* The original iv if this cand is added from biv with
 			   smaller type.  */
+  bool doloop_p;	/* Whether this is a doloop candidate.  */
 };
 
 /* Hashtable entry for common candidate derived from iv uses.  */
@@ -612,6 +618,9 @@ struct ivopts_data
 
   /* Whether the loop body can only be exited via single exit.  */
   bool loop_single_exit_p;
+
+  /* Whether the loop has doloop comparison use.  */
+  bool doloop_use_p;
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -1528,6 +1537,7 @@ record_group (struct ivopts_data *data, enum use_type type)
   group->type = type;
   group->related_cands = BITMAP_ALLOC (NULL);
   group->vuses.create (1);
+  group->doloop_p = false;
 
   data->vgroups.safe_push (group);
   return group;
@@ -3017,9 +3027,9 @@ get_loop_invariant_expr (struct ivopts_data *data, tree inv_expr)
    replacement of the final value of the iv by a direct computation.  */
 
 static struct iv_cand *
-add_candidate_1 (struct ivopts_data *data,
-		 tree base, tree step, bool important, enum iv_position pos,
-		 struct iv_use *use, gimple *incremented_at,
+add_candidate_1 (struct ivopts_data *data, tree base, tree step, bool important,
+		 enum iv_position pos, struct iv_use *use,
+		 gimple *incremented_at, bool doloop = false,
 		 struct iv *orig_iv = NULL)
 {
   unsigned i;
@@ -3079,11 +3089,15 @@ add_candidate_1 (struct ivopts_data *data,
       cand->pos = pos;
       if (pos != IP_ORIGINAL)
 	{
-	  cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
+	  if (doloop)
+	    cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "doloop");
+	  else
+	    cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
 	  cand->var_after = cand->var_before;
 	}
       cand->important = important;
       cand->incremented_at = incremented_at;
+      cand->doloop_p = doloop;
       data->vcands.safe_push (cand);
 
       if (!poly_int_tree_p (step))
@@ -3116,6 +3130,7 @@ add_candidate_1 (struct ivopts_data *data,
     }
 
   cand->important |= important;
+  cand->doloop_p |= doloop;
 
   /* Relate candidate to the group for which it is added.  */
   if (use)
@@ -3209,16 +3224,17 @@ add_autoinc_candidates (struct ivopts_data *data, tree base, tree step,
    the end of loop.  */
 
 static void
-add_candidate (struct ivopts_data *data,
-	       tree base, tree step, bool important, struct iv_use *use,
+add_candidate (struct ivopts_data *data, tree base, tree step, bool important,
+	       struct iv_use *use, bool doloop = false,
 	       struct iv *orig_iv = NULL)
 {
   if (ip_normal_pos (data->current_loop))
-    add_candidate_1 (data, base, step, important,
-		     IP_NORMAL, use, NULL, orig_iv);
-  if (ip_end_pos (data->current_loop)
+    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
+		     orig_iv);
+  if (!doloop && ip_end_pos (data->current_loop)
       && allow_ip_end_pos_p (data->current_loop))
-    add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
+    add_candidate_1 (data, base, step, important, IP_END, use, NULL, doloop,
+		     orig_iv);
 }
 
 /* Adds standard iv candidates.  */
@@ -3262,7 +3278,7 @@ add_iv_candidate_for_biv (struct ivopts_data *data, struct iv *iv)
       tree step = fold_convert (sizetype, iv->step);
 
       /* Add iv cand of same precision as index part in TARGET_MEM_REF.  */
-      add_candidate (data, base, step, true, NULL, iv);
+      add_candidate (data, base, step, true, NULL, false, iv);
       /* Add iv cand of the original type only if it has nonlinear use.  */
       if (iv->nonlin_use)
 	add_candidate (data, iv->base, iv->step, true, NULL);
@@ -3724,7 +3740,7 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
    Some RTL specific checks seems unable to be checked in gimple, if any new
    checks or easy checks _are_ missing here, please add them.  */
 
-static bool ATTRIBUTE_UNUSED
+static bool
 generic_predict_doloop_p (struct ivopts_data *data)
 {
   struct loop *loop = data->current_loop;
@@ -4177,6 +4193,36 @@ force_expr_to_var_cost (tree expr, bool speed)
       STRIP_NOPS (op0);
       op1 = NULL_TREE;
       break;
+    /* See add_iv_candidate_for_doloop, for doloop may_be_zero case, we
+       introduce COND_EXPR for IV base, need to support better cost estimation
+       for this COND_EXPR and tcc_comparison.  */
+    case COND_EXPR:
+      op0 = TREE_OPERAND (expr, 1);
+      STRIP_NOPS (op0);
+      op1 = TREE_OPERAND (expr, 2);
+      STRIP_NOPS (op1);
+      break;
+    case LT_EXPR:
+    case LE_EXPR:
+    case GT_EXPR:
+    case GE_EXPR:
+    case EQ_EXPR:
+    case NE_EXPR:
+    case UNORDERED_EXPR:
+    case ORDERED_EXPR:
+    case UNLT_EXPR:
+    case UNLE_EXPR:
+    case UNGT_EXPR:
+    case UNGE_EXPR:
+    case UNEQ_EXPR:
+    case LTGT_EXPR:
+    case MAX_EXPR:
+    case MIN_EXPR:
+      op0 = TREE_OPERAND (expr, 0);
+      STRIP_NOPS (op0);
+      op1 = TREE_OPERAND (expr, 1);
+      STRIP_NOPS (op1);
+      break;
 
     default:
       /* Just an arbitrary value, FIXME.  */
@@ -4258,6 +4304,35 @@ force_expr_to_var_cost (tree expr, bool speed)
     case RSHIFT_EXPR:
       cost = comp_cost (add_cost (speed, mode), 0);
       break;
+    case COND_EXPR:
+      op0 = TREE_OPERAND (expr, 0);
+      STRIP_NOPS (op0);
+      if (op0 == NULL_TREE || TREE_CODE (op0) == SSA_NAME
+	  || CONSTANT_CLASS_P (op0))
+	cost = no_cost;
+      else
+	cost = force_expr_to_var_cost (op0, speed);
+      break;
+    case LT_EXPR:
+    case LE_EXPR:
+    case GT_EXPR:
+    case GE_EXPR:
+    case EQ_EXPR:
+    case NE_EXPR:
+    case UNORDERED_EXPR:
+    case ORDERED_EXPR:
+    case UNLT_EXPR:
+    case UNLE_EXPR:
+    case UNGT_EXPR:
+    case UNGE_EXPR:
+    case UNEQ_EXPR:
+    case LTGT_EXPR:
+    case MAX_EXPR:
+    case MIN_EXPR:
+      /* Simply use 1.5 * add cost for now, FIXME if there is some more accurate
+	 cost evaluation way.  */
+      cost = comp_cost (1.5 * add_cost (speed, mode), 0);
+      break;
 
     default:
       gcc_unreachable ();
@@ -4706,8 +4781,12 @@ determine_group_iv_cost_generic (struct ivopts_data *data,
   if (cand->pos == IP_ORIGINAL && cand->incremented_at == use->stmt)
     cost = no_cost;
   else
-    cost = get_computation_cost (data, use, cand, false,
-				 &inv_vars, NULL, &inv_expr);
+    {
+      cost = get_computation_cost (data, use, cand, false, &inv_vars, NULL,
+				   &inv_expr);
+      if (cand->doloop_p)
+	cost += targetm.doloop_cost_for_generic;
+    }
 
   if (inv_expr)
     {
@@ -4735,6 +4814,9 @@ determine_group_iv_cost_address (struct ivopts_data *data,
   cost = get_computation_cost (data, use, cand, true,
 			       &inv_vars, &can_autoinc, &inv_expr);
 
+  if (cand->doloop_p)
+    cost += targetm.doloop_cost_for_address;
+
   if (inv_expr)
     {
       inv_exprs = BITMAP_ALLOC (NULL);
@@ -5142,6 +5224,15 @@ may_eliminate_iv (struct ivopts_data *data,
 	}
     }
 
+  /* For doloop IV cand, the bound would be zero.  It's safe whether
+     may_be_zero is set or not.  */
+  if (cand->doloop_p)
+    {
+      *bound = build_int_cst (TREE_TYPE (cand->iv->base), 0);
+      *comp = iv_elimination_compare (data, use);
+      return true;
+    }
+
   cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
 
   *bound = fold_convert (TREE_TYPE (cand->iv->base),
@@ -5264,6 +5355,9 @@ determine_group_iv_cost_cond (struct ivopts_data *data,
       inv_vars = inv_vars_elim;
       inv_vars_elim = NULL;
       inv_expr = inv_expr_elim;
+      /* For doloop candidate/use pair, adjust to zero cost.  */
+      if (group->doloop_p && cand->doloop_p)
+	cost = no_cost;
     }
   else
     {
@@ -5390,6 +5484,42 @@ relate_compare_use_with_all_cands (struct ivopts_data *data)
     }
 }
 
+/* Add one doloop dedicated IV candidate:
+     - Base is (may_be_zero ? 1 : (niter + 1)).
+     - Step is -1.  */
+
+static void
+add_iv_candidate_for_doloop (struct ivopts_data *data)
+{
+  tree_niter_desc *niter_desc = niter_for_single_dom_exit (data);
+  gcc_assert (niter_desc && niter_desc->assumptions);
+
+  tree niter = niter_desc->niter;
+  tree ntype = TREE_TYPE (niter);
+  gcc_assert (TREE_CODE (ntype) == INTEGER_TYPE);
+
+  tree may_be_zero = niter_desc->may_be_zero;
+  if (may_be_zero && integer_zerop (may_be_zero))
+    may_be_zero = NULL_TREE;
+  if (may_be_zero)
+    {
+      if (COMPARISON_CLASS_P (may_be_zero))
+	{
+	  niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
+			       build_int_cst (ntype, 0),
+			       rewrite_to_non_trapping_overflow (niter));
+	}
+      /* Don't try to obtain the iteration count expression when may_be_zero is
+	 integer_nonzerop (actually iteration count is one) or else.  */
+      else
+	return;
+    }
+
+  tree base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+			   build_int_cst (ntype, 1));
+  add_candidate (data, base, build_int_cst (ntype, -1), true, NULL, true);
+}
+
 /* Finds the candidates for the induction variables.  */
 
 static void
@@ -5398,6 +5528,10 @@ find_iv_candidates (struct ivopts_data *data)
   /* Add commonly used ivs.  */
   add_standard_iv_candidates (data);
 
+  /* Add doloop dedicated ivs.  */
+  if (data->doloop_use_p)
+    add_iv_candidate_for_doloop (data);
+
   /* Add old induction variables.  */
   add_iv_candidate_for_bivs (data);
 
@@ -5578,16 +5712,21 @@ determine_iv_cost (struct ivopts_data *data, struct iv_cand *cand)
      or a const set.  */
   if (cost_base.cost == 0)
     cost_base.cost = COSTS_N_INSNS (1);
-  cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
-
+  /* Doloop decrement should be considered as zero cost.  */
+  if (cand->doloop_p)
+    cost_step = 0;
+  else
+    cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
   cost = cost_step + adjust_setup_cost (data, cost_base.cost);
 
   /* Prefer the original ivs unless we may gain something by replacing it.
      The reason is to make debugging simpler; so this is not relevant for
      artificial ivs created by other optimization passes.  */
-  if (cand->pos != IP_ORIGINAL
-      || !SSA_NAME_VAR (cand->var_before)
-      || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+  if ((cand->pos != IP_ORIGINAL
+       || !SSA_NAME_VAR (cand->var_before)
+       || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+      /* Prefer doloop as well.  */
+      && !cand->doloop_p)
     cost++;
 
   /* Prefer not to insert statements into latch unless there are some
@@ -5832,7 +5971,8 @@ iv_ca_set_no_cp (struct ivopts_data *data, struct iv_ca *ivs,
   if (ivs->n_cand_uses[cid] == 0)
     {
       bitmap_clear_bit (ivs->cands, cid);
-      ivs->n_cands--;
+      if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
+	ivs->n_cands--;
       ivs->cand_cost -= cp->cand->cost;
       iv_ca_set_remove_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
       iv_ca_set_remove_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
@@ -5889,7 +6029,8 @@ iv_ca_set_cp (struct ivopts_data *data, struct iv_ca *ivs,
       if (ivs->n_cand_uses[cid] == 1)
 	{
 	  bitmap_set_bit (ivs->cands, cid);
-	  ivs->n_cands++;
+	  if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
+	    ivs->n_cands++;
 	  ivs->cand_cost += cp->cand->cost;
 	  iv_ca_set_add_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
 	  iv_ca_set_add_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
@@ -6134,6 +6275,8 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs)
 
   fprintf (file, "  cost: %" PRId64 " (complexity %d)\n", cost.cost,
 	   cost.complexity);
+  fprintf (file, "  reg_cost: %d\n",
+	   ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands));
   fprintf (file, "  cand_cost: %" PRId64 "\n  cand_group_cost: "
 	   "%" PRId64 " (complexity %d)\n", ivs->cand_cost,
 	   ivs->cand_use_cost.cost, ivs->cand_use_cost.complexity);
@@ -7568,6 +7711,75 @@ determine_scaling_factor (struct ivopts_data *data, basic_block *body)
     }
 }
 
+/* Find doloop comparison use and set its doloop_p on if found.  */
+
+static bool
+find_doloop_use (struct ivopts_data *data)
+{
+  struct loop *loop = data->current_loop;
+
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->type == USE_COMPARE)
+	{
+	  gcc_assert (group->vuses.length () == 1);
+	  struct iv_use *use = group->vuses[0];
+	  gimple *stmt = use->stmt;
+	  if (gimple_code (stmt) == GIMPLE_COND)
+	    {
+	      basic_block bb = gimple_bb (stmt);
+	      edge true_edge, false_edge;
+	      extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+	      /* This comparison is used for loop latch.  Require latch is empty
+		 for now.  */
+	      if ((loop->latch == true_edge->dest
+		   || loop->latch == false_edge->dest)
+		  && empty_block_p (loop->latch))
+		{
+		  group->doloop_p = true;
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file, "Doloop cmp iv use: ");
+		      print_gimple_stmt (dump_file, stmt, TDF_DETAILS);
+		    }
+		  return true;
+		}
+	    }
+	}
+    }
+
+  return false;
+}
+
+/* For targets which support doloop, predict whether the later RTL doloop
+   transformation will be performed on this loop, then detect the doloop use
+   and set the flag doloop_use_p if one is found.  */
+
+void
+predict_and_process_doloop (struct ivopts_data *data)
+{
+  if (!flag_branch_on_count_reg)
+    return;
+
+  if (!generic_predict_doloop_p (data))
+    return;
+
+  if (find_doloop_use (data))
+    {
+      data->doloop_use_p = true;
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  struct loop *loop = data->current_loop;
+	  fprintf (dump_file,
+		   "Predict loop %d can perform"
+		   " doloop optimization later.\n",
+		   loop->num);
+	  flow_loop_dump (loop, dump_file, NULL, 1);
+	}
+    }
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
@@ -7580,6 +7792,7 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   basic_block *body;
 
   gcc_assert (!data->niters);
+  data->doloop_use_p = false;
   data->current_loop = loop;
   data->loop_loc = find_loop_location (loop).get_location_t ();
   data->speed = optimize_loop_for_speed_p (loop);
@@ -7622,6 +7835,9 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   /* Determine cost scaling factor for basic blocks in loop.  */
   determine_scaling_factor (data, body);
 
+  /* Predict doloop and find the doloop use if predicted.  */
+  predict_and_process_doloop (data);
+
   /* Finds candidates for the induction variables (item 2).  */
   find_iv_candidates (data);
 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-08-22  9:16                 ` Kewen.Lin
@ 2019-08-23  5:31                   ` Bin.Cheng
  2019-08-23  9:57                     ` Kewen.Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Bin.Cheng @ 2019-08-23  5:31 UTC (permalink / raw)
  To: Kewen.Lin
  Cc: gcc-patches List, Segher Boessenkool, Bill Schmidt, Richard Guenther

On Thu, Aug 22, 2019 at 3:09 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> Hi Bin,
>
> on 2019/8/22 下午1:46, Bin.Cheng wrote:
> > On Thu, Aug 22, 2019 at 11:18 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
> >>
> >> Hi Bin,
> >>
> >> Thanks for your time!
> >>
> >> on 2019/8/21 下午8:32, Bin.Cheng wrote:
> >>> On Wed, Aug 14, 2019 at 3:23 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
> >>>>
> >>>> Hi!
> >>>>
> >>>> Comparing to the previous versions of implementation mainly based on the
> >>>> existing IV cands but zeroing the related group/use cost, this new one is based
> >>>> on Richard and Segher's suggestion introducing one doloop dedicated IV cand.
> >>>>
> >>>> Some key points are listed below:
> >>>>   1) New field doloop_p in struct iv_cand to indicate doloop dedicated IV cand.
> >>>>   2) Special name "doloop" assigned.
> >>>>   3) Doloop IV cand with form (niter+1, +, -1)
> >>>>   4) For doloop IV cand, no extra one cost like BIV, assign zero cost for step.
> >>>>   5) Support may_be_zero (regressed PR is in this case), the base of doloop IV
> >>>>      can be COND_EXPR, add handlings in cand_value_at and may_eliminate_iv.
> >>>>   6) Add more expr support in force_expr_to_var_cost for reasonable cost
> >>>>      calculation on the IV base with may_be_zero (like COND_EXPR).
> >>>>   7) Set zero cost when using doloop IV cand for doloop use.
> >>>>   8) Add three hooks (should we merge _generic and _address?).
> >>>>     *) have_count_reg_decr_p, is to indicate the target has special hardware
> >>>>        count register, we shouldn't consider the impact of doloop IV when
> >>>>        calculating register pressures.
> >>>>     *) doloop_cost_for_generic, is the extra cost when using doloop IV cand for
> >>>>        generic type IV use.
> >>>>     *) doloop_cost_for_address, is the extra cost when using doloop IV cand for
> >>>>        address type IV use.
> >>> What will happen if doloop IV cand be used for generic/address type iv
> >>> use?  Can RTL doloop can still perform doloop optimization in this
> >>> case?
> >>>
> >>
> >> On Power, we put the iteration count into hardware count register, it takes very
> >> high cost to move the count to GPR, so the cost is set as INF to make it impossible
> >> to use it for generic/address type iv use.  But as some discussion before, on some
> >> targets using GPR instead of hardware count register, they probably want to use this
> >> doloop iv used for other uses if profitable.  These two hooks offer the possibility.
> >> In that case, I think RTL doloop can still perform since it can still get the
> >> pattern and transform.  The generic/address uses can still use it.
> >>>>
> >>>> Bootstrapped on powerpc64le-linux-gnu and regression testing passed excepting
> >>>> for one failure on gcc/testsuite/gcc.dg/guality/loop-1.c at -O3 which is tracked
> >>>> by PR89983.
> >>>>
> >>>> Any comments and suggestions are highly appreciated.  Thanks!
> >>> Not sure if I understand the patch correctly, some comments embedded.
> >>>
> >>> +  /* The number of doloop candidate in the set.  */
> >>> +  unsigned n_doloop_cands;
> >>> +
> >>> This is unnecessary.  See below comments.
> >>>
> >>> -    add_candidate_1 (data, base, step, important,
> >>> -                    IP_NORMAL, use, NULL, orig_iv);
> >>> +    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
> >>> +                    orig_iv);
> >>>    if (ip_end_pos (data->current_loop)
> >>>        && allow_ip_end_pos_p (data->current_loop))
> >>> -    add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
> >>> +    add_candidate_1 (data, base, step, important, IP_END, use, NULL, doloop,
> >>> +                    orig_iv);
> >>> Do we need to skip ip_end_pos case for doloop candidate?  Because the
> >>> candidate increment will be inserted in latch, i.e, increment position
> >>> is after exit condition.
> >>>
> >>
> >> Yes, we should skip it.  Currently function find_doloop_use has the check on an
> >> empty latch and gimple_cond to latch, partially excluding it.  But it's still good
> >> to guard it directly here.
> >>
> >>> -  tree_to_aff_combination (iv->base, type, val);
> >>> +  tree base = iv->base;
> >>> +  /* See add_iv_candidate_for_doloop, if may_be_zero is set, we want to extract
> >>> +     the value under !may_be_zero to get the compact bound which also well fits
> >>> +     for may_be_zero since we ensure the value for it is const one.  */
> >>> +  if (cand->doloop_p && desc->may_be_zero && !integer_zerop
> >>> (desc->may_be_zero))
> >>> +    base = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
> >>> +                       unshare_expr (rewrite_to_non_trapping_overflow (niter)),
> >>> +                       build_int_cst (TREE_TYPE (niter), 1));
> >>> +  tree_to_aff_combination (base, type, val);
> >>> I don't quite follow here.  The iv->base is computed from niter, I
> >>> suppose compact bound is for cheaper candidate initialization?  Why
> >>> it's possible to extract !may_be_zero niter for may_be_zero here?  The
> >>> niter under !may_be_zero has no indication about the real niter under
> >>> may_be_zero.
> >>>
> >>
> >> As you note below, the cand_value for doloop would be zero, but for the case
> >> may_be_zero set, the current calculation would take care of the whole niter
> >> expression including the cond_expr introduced by may_be_zero check, it's
> >> unexpected.  The purpose is to use the value under condition !may_be_zero
> >> for the calculation, and yes, to get expected zero finally.
> >>
> >>> -  cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
> >>> +  cand_value_at (loop, cand, use->stmt, desc, &bnd);
> >>> If I understand correctly, doloop use/cand will only be
> >>> identified/added for single exit loop, and there will be only one
> >>> cond(doloop) iv_use and only one doloop cand for doloop loop.  So the
> >>> cand_value at niter at use position would be 0.  If that's the case,
> >>> we can skip calling cand_value_at here for doloop cand.  The change to
> >>> cand_value_at would be unnecessary as well.
> >>>
> >>
> >> Exactly, I'll add the early return with zero bound for doloop.
> >>
> >>> -          expensive.  */
> >>> -  if (!integer_zerop (desc->may_be_zero))
> >>> +          expensive.
> >>> +
> >>> +     For doloop candidate, we have considered MAY_BE_ZERO for IV base, need to
> >>> +     support MAY_BE_ZERO ? 0 : NITER, so simply bypass this check.  */
> >>> +  if (!integer_zerop (desc->may_be_zero) && !cand->doloop_p)
> >>>      return iv_elimination_compare_lt (data, cand, comp, desc);
> >>> And we can early return before this?
> >>>
> >>
> >> OK.
> >>
> >>> +  if (may_be_zero)
> >>> +    {
> >>> +      if (COMPARISON_CLASS_P (may_be_zero))
> >>> +       {
> >>> +         niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
> >>> +                              build_int_cst (ntype, 0),
> >>> +                              rewrite_to_non_trapping_overflow (niter));
> >>> +       }
> >>> +      /* Don't try to obtain the iteration count expression when may_be_zero is
> >>> +        integer_nonzerop (actually iteration count is one) or else.  */
> >>> +      else
> >>> +       return;
> >>> +    }
> >>> +
> >>> +  tree base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
> >>> +                          build_int_cst (ntype, 1));
> >>> niter is the number of latch executions, so niter + 1 could wrap here,
> >>> but I guess it's not a problem, since the similar issue is not handled
> >>> in the vectorizer either.
> >>>
> >>
> >> OK.
> >>
> >>> +  unsigned n_old = data->regs_used, n_spr_for_doloop = 0;
> >>> +  /* If target supports count register for doloop, it doesn't take GPR.  */
> >>> +  if (targetm.have_count_reg_decr_p)
> >>> +    n_spr_for_doloop = n_doloop_cands;
> >>> +  unsigned n_new = n_invs + n_cands - n_spr_for_doloop;
> >>> Not necessary.  See below.
> >>
> >>> -  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands);
> >>> +  cost += ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands,
> >>> +                                       ivs->n_doloop_cands);
> >>> Also.
> >>>
> >>>        ivs->n_cands--;
> >>> +      if (cp->cand->doloop_p)
> >>> +       ivs->n_doloop_cands--;
> >>>
> >>>           ivs->n_cands++;
> >>> +         if (cp->cand->doloop_p)
> >>> +           ivs->n_doloop_cands++;
> >>> You can just book n_cands under condition !cp->cand->doloop_p.
> >>
> >> If my understanding is correct, you are suggesting the code like:
> >>
> >> if (!cp->cand->doloop_p)
> >>   ivs->n_cands++;
> >>
> >> But I'm afraid that it can NOT satisfy the need in function
> >> ivopts_estimate_reg_pressure.  As the comments, "if target supports
> >> count register for doloop it doesn't take GPR.".  If we make doloop
> >> cand invisible in n_cands, it's fine for target with count register,
> >> but we may miss to count them on targets without count register.
> > Why not one more step do checks:
> > if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
> >   ivs->n_cands++;
> >
>
> Yes, it works.  Thanks!
>
> The new patch addressing the comments is attached.
> Could you please have a look again?  Thanks in advance!
Thanks for working on this.  A bit more nit-pickings.

-    add_candidate_1 (data, base, step, important,
-                    IP_NORMAL, use, NULL, orig_iv);
-  if (ip_end_pos (data->current_loop)
+    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
+                    orig_iv);
+  if (!doloop && ip_end_pos (data->current_loop)
Could you add some comments elaborating why ip_end_pos candidate
shouldn't be added for doloop case?  Because the increment position is
wrong.

Also if you make doloop the last default parameter of add_candidate_1,
you can save more unnecessary changes to calls to add_candidate?

-    cost = get_computation_cost (data, use, cand, false,
-                                &inv_vars, NULL, &inv_expr);
+    {
+      cost = get_computation_cost (data, use, cand, false, &inv_vars, NULL,
+                                  &inv_expr);
+      if (cand->doloop_p)
+       cost += targetm.doloop_cost_for_generic;
+    }
This adjustment

   cost = get_computation_cost (data, use, cand, true,
                               &inv_vars, &can_autoinc, &inv_expr);

+  if (cand->doloop_p)
+    cost += targetm.doloop_cost_for_address;
+
and this adjustment can be moved into get_computation_cost where all
cost adjustments are done.

+      /* For doloop candidate/use pair, adjust to zero cost.  */
+      if (group->doloop_p && cand->doloop_p)
+       cost = no_cost;
Note above code handles comparing against zero case and decreases the
cost by one (which prefers the same kind candidate as doloop one),
it's very possible to have -1 cost for doloop cand here.  how about
just set to no_cost if it's positive?  Your call.

+/* For the targets which support doloop, to predict whether later RTL doloop
+   transformation will perform on this loop, further detect the doloop use and
+   mark the flag doloop_use_p if predicted.  */
+
+void
+predict_and_process_doloop (struct ivopts_data *data)
A better name here? Sorry I don't have another candidate in mind...

+  data->doloop_use_p = false;
This can be moved to the beginning of above
'predict_and_process_doloop' function.

Lastly, could you please add some brief description/comment about
doloop handling as a subsection in the file head comment?

Otherwise, the ivopt changes look good to me.

Thanks,
bin

>
>
> Kewen
>
> ---------
>
> gcc/ChangeLog
>
> 2019-08-22  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * config/rs6000/rs6000.c (TARGET_HAVE_COUNT_REG_DECR_P): New macro.
>         (TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
>         (TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
>         * target.def (have_count_reg_decr_p): New hook.
>         (doloop_cost_for_generic): Likewise.
>         (doloop_cost_for_address): Likewise.
>         * doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): Likewise.
>         (TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
>         (TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
>         * doc/tm.texi: Regenerate.
>         * tree-ssa-loop-ivopts.c (comp_cost::operator+=): Consider infinite cost
>         addend.
>         (record_group): Init doloop_p.
>         (add_candidate_1): Add optional argument doloop, change the handlings
>         accordingly.
>         (add_candidate): Likewise.
>         (add_iv_candidate_for_biv): Update the call to add_candidate.
>         (generic_predict_doloop_p): Update attribute.
>         (force_expr_to_var_cost): Add costing for expressions COND_EXPR/LT_EXPR/
>         LE_EXPR/GT_EXPR/GE_EXPR/EQ_EXPR/NE_EXPR/UNORDERED_EXPR/ORDERED_EXPR/
>         UNLT_EXPR/UNLE_EXPR/UNGT_EXPR/UNGE_EXPR/UNEQ_EXPR/LTGT_EXPR/MAX_EXPR/
>         MIN_EXPR.
>         (determine_group_iv_cost_generic): Update for doloop IV cand.
>         (determine_group_iv_cost_address): Likewise.
>         (determine_group_iv_cost_cond): Likewise.
>         (determine_iv_cost): Likewise.
>         (ivopts_estimate_reg_pressure): Likewise.
>         (may_eliminate_iv): Likewise.
>         (add_iv_candidate_for_doloop): New function.
>         (find_iv_candidates): Call function add_iv_candidate_for_doloop.
>         (iv_ca_set_no_cp): Update for doloop IV cand.
>         (iv_ca_set_cp): Likewise.
>         (iv_ca_dump): Dump register cost.
>         (find_doloop_use): New function.
>         (predict_and_process_doloop): Likewise.
>         (tree_ssa_iv_optimize_loop): Call function predict_and_process_doloop.
>
> gcc/testsuite/ChangeLog
>
> 2019-08-22  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * gcc.dg/tree-ssa/ivopts-3.c: Adjust for doloop change.
>         * gcc.dg/tree-ssa/ivopts-lt.c: Likewise.
>         * gcc.dg/tree-ssa/pr32044.c: Likewise.
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-08-23  5:31                   ` Bin.Cheng
@ 2019-08-23  9:57                     ` Kewen.Lin
  2019-08-23 10:43                       ` Bin.Cheng
  0 siblings, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-08-23  9:57 UTC (permalink / raw)
  To: Bin.Cheng
  Cc: gcc-patches List, Segher Boessenkool, Bill Schmidt, Richard Guenther

[-- Attachment #1: Type: text/plain, Size: 6594 bytes --]

Hi Bin

on 2019/8/23 上午10:19, Bin.Cheng wrote:
> On Thu, Aug 22, 2019 at 3:09 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> Hi Bin,
>>
>> on 2019/8/22 下午1:46, Bin.Cheng wrote:
>>> On Thu, Aug 22, 2019 at 11:18 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>>>
>>>> Hi Bin,
>>>>
>>>> Thanks for your time!
>>>>
>>>> on 2019/8/21 下午8:32, Bin.Cheng wrote:
>>>>> On Wed, Aug 14, 2019 at 3:23 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>>>>>
>>>>>> Hi!
>>>>>>
>>>>>> Comparing to the previous versions of implementation mainly based on the
>>>>>> existing IV cands but zeroing the related group/use cost, this new one is based
>>>>>> on Richard and Segher's suggestion introducing one doloop dedicated IV cand.
>>>>>>
>>>>>> Some key points are listed below:
>>>>>>   1) New field doloop_p in struct iv_cand to indicate doloop dedicated IV cand.
>>>>>>   2) Special name "doloop" assigned.
>>>>>>   3) Doloop IV cand with form (niter+1, +, -1)
>>>>>>   4) For doloop IV cand, no extra one cost like BIV, assign zero cost for step.
>>>>>>   5) Support may_be_zero (regressed PR is in this case), the base of doloop IV
>>>>>>      can be COND_EXPR, add handlings in cand_value_at and may_eliminate_iv.
>>>>>>   6) Add more expr support in force_expr_to_var_cost for reasonable cost
>>>>>>      calculation on the IV base with may_be_zero (like COND_EXPR).
>>>>>>   7) Set zero cost when using doloop IV cand for doloop use.
>>>>>>   8) Add three hooks (should we merge _generic and _address?).
>>>>>>     *) have_count_reg_decr_p, is to indicate the target has special hardware
>>>>>>        count register, we shouldn't consider the impact of doloop IV when
>>>>>>        calculating register pressures.
>>>>>>     *) doloop_cost_for_generic, is the extra cost when using doloop IV cand for
>>>>>>        generic type IV use.
>>>>>>     *) doloop_cost_for_address, is the extra cost when using doloop IV cand for
>>>>>>        address type IV use.

>> The new patch addressing the comments is attached.
>> Could you please have a look again?  Thanks in advance!
> Thanks for working on this.  A bit more nit-pickings.
> 
> -    add_candidate_1 (data, base, step, important,
> -                    IP_NORMAL, use, NULL, orig_iv);
> -  if (ip_end_pos (data->current_loop)
> +    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
> +                    orig_iv);
> +  if (!doloop && ip_end_pos (data->current_loop)
> Could you add some comments elaborating why ip_end_pos candidate
> shouldn't be added for doloop case?  Because the increment position is
> wrong.
> 
> Also if you make doloop the last default parameter of add_candidate_1,
> you can save more unnecessary changes to calls to add_candidate?
> 
> -    cost = get_computation_cost (data, use, cand, false,
> -                                &inv_vars, NULL, &inv_expr);
> +    {
> +      cost = get_computation_cost (data, use, cand, false, &inv_vars, NULL,
> +                                  &inv_expr);
> +      if (cand->doloop_p)
> +       cost += targetm.doloop_cost_for_generic;
> +    }
> This adjustment
> 
>    cost = get_computation_cost (data, use, cand, true,
>                                &inv_vars, &can_autoinc, &inv_expr);
> 
> +  if (cand->doloop_p)
> +    cost += targetm.doloop_cost_for_address;
> +
> and this adjustment can be moved into get_computation_cost where all
> cost adjustments are done.
> 
> +      /* For doloop candidate/use pair, adjust to zero cost.  */
> +      if (group->doloop_p && cand->doloop_p)
> +       cost = no_cost;
> Note above code handles comparing against zero case and decreases the
> cost by one (which prefers the same kind candidate as doloop one),
> it's very possible to have -1 cost for doloop cand here.  how about
> just set to no_cost if it's positive?  Your call.
> 
> +/* For the targets which support doloop, to predict whether later RTL doloop
> +   transformation will perform on this loop, further detect the doloop use and
> +   mark the flag doloop_use_p if predicted.  */
> +
> +void
> +predict_and_process_doloop (struct ivopts_data *data)
> A better name here? Sorry I don't have another candidate in mind...
> 
> +  data->doloop_use_p = false;
> This can be moved to the beginning of above
> 'predict_and_process_doloop' function.
> 
> Lastly, could you please add some brief description/comment about
> doloop handling as a subsection in the file head comment?
> 
> Otherwise, the ivopt changes look good to me.
> 
> Thanks,
> bin
> 

Thanks for your prompt reply!  I've updated the code per your comments;
the updated version is attached.  Looking forward to your review again.


Thanks,
Kewen

-----

gcc/ChangeLog

2019-08-23  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* config/rs6000/rs6000.c (TARGET_HAVE_COUNT_REG_DECR_P): New macro.
	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
	* target.def (have_count_reg_decr_p): New hook.
	(doloop_cost_for_generic): Likewise.
	(doloop_cost_for_address): Likewise.
	* doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): Likewise.
	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
	* doc/tm.texi: Regenerate.
	* tree-ssa-loop-ivopts.c (comp_cost::operator+=): Consider infinite cost
	addend.
	(record_group): Init doloop_p.
	(add_candidate_1): Add optional argument doloop, change the handlings
	accordingly.
	(add_candidate): Likewise.
	(generic_predict_doloop_p): Update attribute.
	(force_expr_to_var_cost): Add costing for expressions COND_EXPR/LT_EXPR/
	LE_EXPR/GT_EXPR/GE_EXPR/EQ_EXPR/NE_EXPR/UNORDERED_EXPR/ORDERED_EXPR/
	UNLT_EXPR/UNLE_EXPR/UNGT_EXPR/UNGE_EXPR/UNEQ_EXPR/LTGT_EXPR/MAX_EXPR/
	MIN_EXPR.
	(get_computation_cost): Update for doloop IV cand extra cost.
	(determine_group_iv_cost_cond): Update for doloop IV cand.
	(determine_iv_cost): Likewise.
	(ivopts_estimate_reg_pressure): Likewise.
	(may_eliminate_iv): Update handlings for doloop IV cand.
	(add_iv_candidate_for_doloop): New function.
	(find_iv_candidates): Call function add_iv_candidate_for_doloop.
	(iv_ca_set_no_cp): Update for doloop IV cand.
	(iv_ca_set_cp): Likewise.
	(iv_ca_dump): Dump register cost.
	(find_doloop_use): New function.
	(analyze_and_mark_doloop_use): Likewise.
	(tree_ssa_iv_optimize_loop): Call function analyze_and_mark_doloop_use.

gcc/testsuite/ChangeLog

2019-08-23  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* gcc.dg/tree-ssa/ivopts-3.c: Adjust for doloop change.
	* gcc.dg/tree-ssa/ivopts-lt.c: Likewise.
	* gcc.dg/tree-ssa/pr32044.c: Likewise.



[-- Attachment #2: doloop_dedicated_iv3.diff --]
[-- Type: text/plain, Size: 22932 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 6667cd0..5eccbdc 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1912,6 +1912,16 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_PREDICT_DOLOOP_P
 #define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
 
+#undef TARGET_HAVE_COUNT_REG_DECR_P
+#define TARGET_HAVE_COUNT_REG_DECR_P true
+
+/* 1000000000 is infinite cost in IVOPTs.  */
+#undef TARGET_DOLOOP_COST_FOR_GENERIC
+#define TARGET_DOLOOP_COST_FOR_GENERIC 1000000000
+
+#undef TARGET_DOLOOP_COST_FOR_ADDRESS
+#define TARGET_DOLOOP_COST_FOR_ADDRESS 1000000000
+
 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index c2aa4d0..9f3a08a 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11618,6 +11618,29 @@ loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
+@deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
+Return true if the target supports hardware count register for decrement
+and branch.  This count register can't be used as general register since
+moving to/from a general register from/to it is very expensive.
+The default value is false.
+@end deftypevr
+
+@deftypevr {Target Hook} int64_t TARGET_DOLOOP_COST_FOR_GENERIC
+IVOPTs introduces one doloop dedicated IV candidate, this hook offers
+ target owner a way to adjust cost when selecting doloop IV candidate for a
+ generic IV use.  At calculation, this value will be added on normal cost
+ already calculated by current implementation.
+The default value is zero.
+@end deftypevr
+
+@deftypevr {Target Hook} int64_t TARGET_DOLOOP_COST_FOR_ADDRESS
+IVOPTs introduces one doloop dedicated IV candidate, this hook offers
+ target owner a way to adjust cost when selecting doloop IV candidate for an
+ address IV use.  At calculation, this value will be added on normal cost
+ already calculated by current implementation.
+The default value is zero.
+@end deftypevr
+
 @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
 Return true if it is possible to use low-overhead loops (@code{doloop_end}
 and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index b4d57b8..4346773 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7946,6 +7946,12 @@ to by @var{ce_info}.
 
 @hook TARGET_PREDICT_DOLOOP_P
 
+@hook TARGET_HAVE_COUNT_REG_DECR_P
+
+@hook TARGET_DOLOOP_COST_FOR_GENERIC
+
+@hook TARGET_DOLOOP_COST_FOR_ADDRESS
+
 @hook TARGET_CAN_USE_DOLOOP_P
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
diff --git a/gcc/target.def b/gcc/target.def
index 71b6972..69e2844 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4246,6 +4246,32 @@ The default version of this hook returns false.",
  bool, (struct loop *loop),
  default_predict_doloop_p)
 
+DEFHOOKPOD
+(have_count_reg_decr_p,
+ "Return true if the target supports hardware count register for decrement\n\
+and branch.  This count register can't be used as general register since\n\
+moving to/from a general register from/to it is very expensive.\n\
+The default value is false.",
+ bool, false)
+
+DEFHOOKPOD
+(doloop_cost_for_generic,
+ "IVOPTs introduces one doloop dedicated IV candidate, this hook offers\n\
+ target owner a way to adjust cost when selecting doloop IV candidate for a\n\
+ generic IV use.  At calculation, this value will be added on normal cost\n\
+ already calculated by current implementation.\n\
+The default value is zero.",
+ int64_t, 0)
+
+DEFHOOKPOD
+(doloop_cost_for_address,
+ "IVOPTs introduces one doloop dedicated IV candidate, this hook offers\n\
+ target owner a way to adjust cost when selecting doloop IV candidate for an\n\
+ address IV use.  At calculation, this value will be added on normal cost\n\
+ already calculated by current implementation.\n\
+The default value is zero.",
+ int64_t, 0)
+
 DEFHOOK
 (can_use_doloop_p,
  "Return true if it is possible to use low-overhead loops (@code{doloop_end}\n\
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
index 214e6a7..ce4b1d0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
@@ -10,4 +10,6 @@ int main (void)
     f2 ();
 }
 
-/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" } }  */
+/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* More debug information emitted for doloop on powerpc.  */
+/* { dg-final { scan-tree-dump-times "!= 0" 6 "ivopts" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 7d5859b..71d7f67 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n)
   while (i < n);
 }
 
-/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
-/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
-/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
+/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c b/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
index 8a8977a..06c27b0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
@@ -1,6 +1,10 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -fdump-tree-optimized" } */
 
+/* For powerpc, disable doloop IV cand generation in IVOPTs to avoid unexpected
+   division operation for its base setup.  */
+/* { dg-additional-options "-fno-branch-count-reg" { target { powerpc*-*-* } } } */
+
 int foo (int n)
 {
   while (n >= 45)
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 530ea4a..88e7890 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -64,7 +64,30 @@ along with GCC; see the file COPYING3.  If not see
    All of this is done loop by loop.  Doing it globally is theoretically
    possible, it might give a better performance and it might enable us
    to decide costs more precisely, but getting all the interactions right
-   would be complicated.  */
+   would be complicated.
+
+   For the targets supporting low-overhead loops, IVOPTs has to take care of
+   the loops which will probably be transformed in RTL doloop optimization,
+   to try to make selected IV candidate set optimal.  The process of doloop
+   support includes:
+
+   1) Analyze whether the current loop will be transformed to doloop, find and
+      mark its compare type IV use as doloop use (iv_group field doloop_p), and
+      set flag doloop_use_p of ivopts_data to notify subsequent processings on
+      doloop.  See analyze_and_mark_doloop_use and its callees for the details.
+      The target hook predict_doloop_p can be used for target specific checks.
+
+   2) Add one doloop dedicated IV cand {(may_be_zero ? 1 : (niter + 1)), +, -1},
+      set flag doloop_p of iv_cand, step cost is set as zero and no extra cost
+      like biv.  For cost determination between doloop IV cand and IV use, the
+      target hooks doloop_cost_for_generic and doloop_cost_for_address are
+      provided to add on extra costs for generic type and address type IV use.
+      Zero cost is assigned to the pair between doloop IV cand and doloop IV
+      use, and bound zero is set for IV elimination.
+
+   3) With the cost setting in step 2), the current cost model based IV
+      selection algorithm will process as usual, pick up doloop dedicated IV if
+      profitable.  */
 
 #include "config.h"
 #include "system.h"
@@ -275,6 +298,9 @@ comp_cost::operator+= (comp_cost cost)
 comp_cost
 comp_cost::operator+= (HOST_WIDE_INT c)
 {
+  if (c >= INFTY)
+    this->cost = INFTY;
+
   if (infinite_cost_p ())
     return *this;
 
@@ -399,6 +425,8 @@ struct iv_group
   struct cost_pair *cost_map;
   /* The selected candidate for the group.  */
   struct iv_cand *selected;
+  /* To indicate this is a doloop use group.  */
+  bool doloop_p;
   /* Uses in the group.  */
   vec<struct iv_use *> vuses;
 };
@@ -439,6 +467,7 @@ struct iv_cand
 			   be hoisted out of loop.  */
   struct iv *orig_iv;	/* The original iv if this cand is added from biv with
 			   smaller type.  */
+  bool doloop_p;	/* Whether this is a doloop candidate.  */
 };
 
 /* Hashtable entry for common candidate derived from iv uses.  */
@@ -612,6 +641,9 @@ struct ivopts_data
 
   /* Whether the loop body can only be exited via single exit.  */
   bool loop_single_exit_p;
+
+  /* Whether the loop has doloop comparison use.  */
+  bool doloop_use_p;
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -1528,6 +1560,7 @@ record_group (struct ivopts_data *data, enum use_type type)
   group->type = type;
   group->related_cands = BITMAP_ALLOC (NULL);
   group->vuses.create (1);
+  group->doloop_p = false;
 
   data->vgroups.safe_push (group);
   return group;
@@ -3017,10 +3050,10 @@ get_loop_invariant_expr (struct ivopts_data *data, tree inv_expr)
    replacement of the final value of the iv by a direct computation.  */
 
 static struct iv_cand *
-add_candidate_1 (struct ivopts_data *data,
-		 tree base, tree step, bool important, enum iv_position pos,
-		 struct iv_use *use, gimple *incremented_at,
-		 struct iv *orig_iv = NULL)
+add_candidate_1 (struct ivopts_data *data, tree base, tree step, bool important,
+		 enum iv_position pos, struct iv_use *use,
+		 gimple *incremented_at, struct iv *orig_iv = NULL,
+		 bool doloop = false)
 {
   unsigned i;
   struct iv_cand *cand = NULL;
@@ -3079,11 +3112,15 @@ add_candidate_1 (struct ivopts_data *data,
       cand->pos = pos;
       if (pos != IP_ORIGINAL)
 	{
-	  cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
+	  if (doloop)
+	    cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "doloop");
+	  else
+	    cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
 	  cand->var_after = cand->var_before;
 	}
       cand->important = important;
       cand->incremented_at = incremented_at;
+      cand->doloop_p = doloop;
       data->vcands.safe_push (cand);
 
       if (!poly_int_tree_p (step))
@@ -3116,6 +3153,7 @@ add_candidate_1 (struct ivopts_data *data,
     }
 
   cand->important |= important;
+  cand->doloop_p |= doloop;
 
   /* Relate candidate to the group for which it is added.  */
   if (use)
@@ -3209,14 +3247,16 @@ add_autoinc_candidates (struct ivopts_data *data, tree base, tree step,
    the end of loop.  */
 
 static void
-add_candidate (struct ivopts_data *data,
-	       tree base, tree step, bool important, struct iv_use *use,
-	       struct iv *orig_iv = NULL)
+add_candidate (struct ivopts_data *data, tree base, tree step, bool important,
+	       struct iv_use *use, struct iv *orig_iv = NULL,
+	       bool doloop = false)
 {
   if (ip_normal_pos (data->current_loop))
-    add_candidate_1 (data, base, step, important,
-		     IP_NORMAL, use, NULL, orig_iv);
-  if (ip_end_pos (data->current_loop)
+    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, orig_iv,
+		     doloop);
+  /* Exclude doloop candidate here since it requires decrement then comparison
+     and jump, the IP_END position doesn't match.  */
+  if (!doloop && ip_end_pos (data->current_loop)
       && allow_ip_end_pos_p (data->current_loop))
     add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
 }
@@ -3724,7 +3764,7 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
    Some RTL specific checks seems unable to be checked in gimple, if any new
    checks or easy checks _are_ missing here, please add them.  */
 
-static bool ATTRIBUTE_UNUSED
+static bool
 generic_predict_doloop_p (struct ivopts_data *data)
 {
   struct loop *loop = data->current_loop;
@@ -4177,6 +4217,36 @@ force_expr_to_var_cost (tree expr, bool speed)
       STRIP_NOPS (op0);
       op1 = NULL_TREE;
       break;
+    /* See add_iv_candidate_for_doloop, for doloop may_be_zero case, we
+       introduce COND_EXPR for IV base, need to support better cost estimation
+       for this COND_EXPR and tcc_comparison.  */
+    case COND_EXPR:
+      op0 = TREE_OPERAND (expr, 1);
+      STRIP_NOPS (op0);
+      op1 = TREE_OPERAND (expr, 2);
+      STRIP_NOPS (op1);
+      break;
+    case LT_EXPR:
+    case LE_EXPR:
+    case GT_EXPR:
+    case GE_EXPR:
+    case EQ_EXPR:
+    case NE_EXPR:
+    case UNORDERED_EXPR:
+    case ORDERED_EXPR:
+    case UNLT_EXPR:
+    case UNLE_EXPR:
+    case UNGT_EXPR:
+    case UNGE_EXPR:
+    case UNEQ_EXPR:
+    case LTGT_EXPR:
+    case MAX_EXPR:
+    case MIN_EXPR:
+      op0 = TREE_OPERAND (expr, 0);
+      STRIP_NOPS (op0);
+      op1 = TREE_OPERAND (expr, 1);
+      STRIP_NOPS (op1);
+      break;
 
     default:
       /* Just an arbitrary value, FIXME.  */
@@ -4258,6 +4328,35 @@ force_expr_to_var_cost (tree expr, bool speed)
     case RSHIFT_EXPR:
       cost = comp_cost (add_cost (speed, mode), 0);
       break;
+    case COND_EXPR:
+      op0 = TREE_OPERAND (expr, 0);
+      STRIP_NOPS (op0);
+      if (op0 == NULL_TREE || TREE_CODE (op0) == SSA_NAME
+	  || CONSTANT_CLASS_P (op0))
+	cost = no_cost;
+      else
+	cost = force_expr_to_var_cost (op0, speed);
+      break;
+    case LT_EXPR:
+    case LE_EXPR:
+    case GT_EXPR:
+    case GE_EXPR:
+    case EQ_EXPR:
+    case NE_EXPR:
+    case UNORDERED_EXPR:
+    case ORDERED_EXPR:
+    case UNLT_EXPR:
+    case UNLE_EXPR:
+    case UNGT_EXPR:
+    case UNGE_EXPR:
+    case UNEQ_EXPR:
+    case LTGT_EXPR:
+    case MAX_EXPR:
+    case MIN_EXPR:
+      /* Simply use 1.5 * add cost for now, FIXME if there is some more accurate
+	 cost evaluation way.  */
+      cost = comp_cost (1.5 * add_cost (speed, mode), 0);
+      break;
 
     default:
       gcc_unreachable ();
@@ -4634,7 +4733,10 @@ get_computation_cost (struct ivopts_data *data, struct iv_use *use,
     {
       cost = get_address_cost (data, use, cand, &aff_inv, &aff_var, ratio,
 			       inv_vars, inv_expr, can_autoinc, speed);
-      return get_scaled_computation_cost_at (data, at, cost);
+      cost = get_scaled_computation_cost_at (data, at, cost);
+      /* For doloop IV cand, add on the extra cost.  */
+      cost += cand->doloop_p ? targetm.doloop_cost_for_address : 0;
+      return cost;
     }
 
   bool simple_inv = (aff_combination_const_p (&aff_inv)
@@ -4684,6 +4786,10 @@ get_computation_cost (struct ivopts_data *data, struct iv_use *use,
   if (comp_inv && !integer_zerop (comp_inv))
     cost += add_cost (speed, TYPE_MODE (utype));
 
+  /* For doloop IV cand, add on the extra cost.  */
+  if (cand->doloop_p && use->type == USE_NONLINEAR_EXPR)
+    cost += targetm.doloop_cost_for_generic;
+
   return get_scaled_computation_cost_at (data, at, cost);
 }
 
@@ -5142,6 +5248,15 @@ may_eliminate_iv (struct ivopts_data *data,
 	}
     }
 
+  /* For doloop IV cand, the bound would be zero.  It's safe whether
+     may_be_zero set or not.  */
+  if (cand->doloop_p)
+    {
+      *bound = build_int_cst (TREE_TYPE (cand->iv->base), 0);
+      *comp = iv_elimination_compare (data, use);
+      return true;
+    }
+
   cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
 
   *bound = fold_convert (TREE_TYPE (cand->iv->base),
@@ -5264,6 +5379,9 @@ determine_group_iv_cost_cond (struct ivopts_data *data,
       inv_vars = inv_vars_elim;
       inv_vars_elim = NULL;
       inv_expr = inv_expr_elim;
+      /* For doloop candidate/use pair, adjust to zero cost.  */
+      if (group->doloop_p && cand->doloop_p && elim_cost.cost > no_cost.cost)
+	cost = no_cost;
     }
   else
     {
@@ -5390,6 +5508,42 @@ relate_compare_use_with_all_cands (struct ivopts_data *data)
     }
 }
 
+/* Add one doloop dedicated IV candidate:
+     - Base is (may_be_zero ? 1 : (niter + 1)).
+     - Step is -1.  */
+
+static void
+add_iv_candidate_for_doloop (struct ivopts_data *data)
+{
+  tree_niter_desc *niter_desc = niter_for_single_dom_exit (data);
+  gcc_assert (niter_desc && niter_desc->assumptions);
+
+  tree niter = niter_desc->niter;
+  tree ntype = TREE_TYPE (niter);
+  gcc_assert (TREE_CODE (ntype) == INTEGER_TYPE);
+
+  tree may_be_zero = niter_desc->may_be_zero;
+  if (may_be_zero && integer_zerop (may_be_zero))
+    may_be_zero = NULL_TREE;
+  if (may_be_zero)
+    {
+      if (COMPARISON_CLASS_P (may_be_zero))
+	{
+	  niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
+			       build_int_cst (ntype, 0),
+			       rewrite_to_non_trapping_overflow (niter));
+	}
+      /* Don't try to obtain the iteration count expression when may_be_zero is
+	 integer_nonzerop (actually iteration count is one) or else.  */
+      else
+	return;
+    }
+
+  tree base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+			   build_int_cst (ntype, 1));
+  add_candidate (data, base, build_int_cst (ntype, -1), true, NULL, NULL, true);
+}
+
 /* Finds the candidates for the induction variables.  */
 
 static void
@@ -5398,6 +5552,10 @@ find_iv_candidates (struct ivopts_data *data)
   /* Add commonly used ivs.  */
   add_standard_iv_candidates (data);
 
+  /* Add doloop dedicated ivs.  */
+  if (data->doloop_use_p)
+    add_iv_candidate_for_doloop (data);
+
   /* Add old induction variables.  */
   add_iv_candidate_for_bivs (data);
 
@@ -5578,16 +5736,21 @@ determine_iv_cost (struct ivopts_data *data, struct iv_cand *cand)
      or a const set.  */
   if (cost_base.cost == 0)
     cost_base.cost = COSTS_N_INSNS (1);
-  cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
-
+  /* Doloop decrement should be considered as zero cost.  */
+  if (cand->doloop_p)
+    cost_step = 0;
+  else
+    cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
   cost = cost_step + adjust_setup_cost (data, cost_base.cost);
 
   /* Prefer the original ivs unless we may gain something by replacing it.
      The reason is to make debugging simpler; so this is not relevant for
      artificial ivs created by other optimization passes.  */
-  if (cand->pos != IP_ORIGINAL
-      || !SSA_NAME_VAR (cand->var_before)
-      || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+  if ((cand->pos != IP_ORIGINAL
+       || !SSA_NAME_VAR (cand->var_before)
+       || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+      /* Prefer doloop as well.  */
+      && !cand->doloop_p)
     cost++;
 
   /* Prefer not to insert statements into latch unless there are some
@@ -5832,7 +5995,8 @@ iv_ca_set_no_cp (struct ivopts_data *data, struct iv_ca *ivs,
   if (ivs->n_cand_uses[cid] == 0)
     {
       bitmap_clear_bit (ivs->cands, cid);
-      ivs->n_cands--;
+      if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
+	ivs->n_cands--;
       ivs->cand_cost -= cp->cand->cost;
       iv_ca_set_remove_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
       iv_ca_set_remove_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
@@ -5889,7 +6053,8 @@ iv_ca_set_cp (struct ivopts_data *data, struct iv_ca *ivs,
       if (ivs->n_cand_uses[cid] == 1)
 	{
 	  bitmap_set_bit (ivs->cands, cid);
-	  ivs->n_cands++;
+	  if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
+	    ivs->n_cands++;
 	  ivs->cand_cost += cp->cand->cost;
 	  iv_ca_set_add_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
 	  iv_ca_set_add_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
@@ -6134,6 +6299,8 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs)
 
   fprintf (file, "  cost: %" PRId64 " (complexity %d)\n", cost.cost,
 	   cost.complexity);
+  fprintf (file, "  reg_cost: %d\n",
+	   ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands));
   fprintf (file, "  cand_cost: %" PRId64 "\n  cand_group_cost: "
 	   "%" PRId64 " (complexity %d)\n", ivs->cand_cost,
 	   ivs->cand_use_cost.cost, ivs->cand_use_cost.complexity);
@@ -7568,6 +7735,77 @@ determine_scaling_factor (struct ivopts_data *data, basic_block *body)
     }
 }
 
+/* Find doloop comparison use and set its doloop_p on if found.  */
+
+static bool
+find_doloop_use (struct ivopts_data *data)
+{
+  struct loop *loop = data->current_loop;
+
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->type == USE_COMPARE)
+	{
+	  gcc_assert (group->vuses.length () == 1);
+	  struct iv_use *use = group->vuses[0];
+	  gimple *stmt = use->stmt;
+	  if (gimple_code (stmt) == GIMPLE_COND)
+	    {
+	      basic_block bb = gimple_bb (stmt);
+	      edge true_edge, false_edge;
+	      extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+	      /* This comparison is used for loop latch.  Require latch is empty
+		 for now.  */
+	      if ((loop->latch == true_edge->dest
+		   || loop->latch == false_edge->dest)
+		  && empty_block_p (loop->latch))
+		{
+		  group->doloop_p = true;
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file, "Doloop cmp iv use: ");
+		      print_gimple_stmt (dump_file, stmt, TDF_DETAILS);
+		    }
+		  return true;
+		}
+	    }
+	}
+    }
+
+  return false;
+}
+
+/* For the targets which support doloop, to predict whether later RTL doloop
+   transformation will perform on this loop, further detect the doloop use and
+   mark the flag doloop_use_p if predicted.  */
+
+void
+analyze_and_mark_doloop_use (struct ivopts_data *data)
+{
+  data->doloop_use_p = false;
+
+  if (!flag_branch_on_count_reg)
+    return;
+
+  if (!generic_predict_doloop_p (data))
+    return;
+
+  if (find_doloop_use (data))
+    {
+      data->doloop_use_p = true;
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  struct loop *loop = data->current_loop;
+	  fprintf (dump_file,
+		   "Predict loop %d can perform"
+		   " doloop optimization later.\n",
+		   loop->num);
+	  flow_loop_dump (loop, dump_file, NULL, 1);
+	}
+    }
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
@@ -7622,6 +7860,9 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   /* Determine cost scaling factor for basic blocks in loop.  */
   determine_scaling_factor (data, body);
 
+  /* Analyze doloop possibility and mark the doloop use if predicted.  */
+  analyze_and_mark_doloop_use (data);
+
   /* Finds candidates for the induction variables (item 2).  */
   find_iv_candidates (data);
 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-08-23  9:57                     ` Kewen.Lin
@ 2019-08-23 10:43                       ` Bin.Cheng
  2019-08-23 11:02                         ` Segher Boessenkool
  2019-08-24 22:43                         ` Kewen.Lin
  0 siblings, 2 replies; 43+ messages in thread
From: Bin.Cheng @ 2019-08-23 10:43 UTC (permalink / raw)
  To: Kewen.Lin
  Cc: gcc-patches List, Segher Boessenkool, Bill Schmidt, Richard Guenther

On Fri, Aug 23, 2019 at 4:27 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> Hi Bin
>
> on 2019/8/23 上午10:19, Bin.Cheng wrote:
> > On Thu, Aug 22, 2019 at 3:09 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
> >>
> >> Hi Bin,
> >>
> >> on 2019/8/22 下午1:46, Bin.Cheng wrote:
> >>> On Thu, Aug 22, 2019 at 11:18 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
> >>>>
> >>>> Hi Bin,
> >>>>
> >>>> Thanks for your time!
> >>>>
> >>>> on 2019/8/21 下午8:32, Bin.Cheng wrote:
> >>>>> On Wed, Aug 14, 2019 at 3:23 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
> >>>>>>
> >>>>>> Hi!
> >>>>>>
> >>>>>> Comparing to the previous versions of implementation mainly based on the
> >>>>>> existing IV cands but zeroing the related group/use cost, this new one is based
> >>>>>> on Richard and Segher's suggestion introducing one doloop dedicated IV cand.
> >>>>>>
> >>>>>> Some key points are listed below:
> >>>>>>   1) New field doloop_p in struct iv_cand to indicate doloop dedicated IV cand.
> >>>>>>   2) Special name "doloop" assigned.
> >>>>>>   3) Doloop IV cand with form (niter+1, +, -1)
> >>>>>>   4) For doloop IV cand, no extra one cost like BIV, assign zero cost for step.
> >>>>>>   5) Support may_be_zero (regressed PR is in this case), the base of doloop IV
> >>>>>>      can be COND_EXPR, add handlings in cand_value_at and may_eliminate_iv.
> >>>>>>   6) Add more expr support in force_expr_to_var_cost for reasonable cost
> >>>>>>      calculation on the IV base with may_be_zero (like COND_EXPR).
> >>>>>>   7) Set zero cost when using doloop IV cand for doloop use.
> >>>>>>   8) Add three hooks (should we merge _generic and _address?).
> >>>>>>     *) have_count_reg_decr_p, is to indicate the target has special hardware
> >>>>>>        count register, we shouldn't consider the impact of doloop IV when
> >>>>>>        calculating register pressures.
> >>>>>>     *) doloop_cost_for_generic, is the extra cost when using doloop IV cand for
> >>>>>>        generic type IV use.
> >>>>>>     *) doloop_cost_for_address, is the extra cost when using doloop IV cand for
> >>>>>>        address type IV use.
>
> >> The new patch addressing the comments is attached.
> >> Could you please have a look again?  Thanks in advance!
> > Thanks for working on this.  A bit more nit-pickings.
> >
> > -    add_candidate_1 (data, base, step, important,
> > -                    IP_NORMAL, use, NULL, orig_iv);
> > -  if (ip_end_pos (data->current_loop)
> > +    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
> > +                    orig_iv);
> > +  if (!doloop && ip_end_pos (data->current_loop)
> > Could you add some comments elaborating why ip_end_pos candidate
> > shouldn't be added for doloop case?  Because the increment position is
> > wrong.
> >
> > Also if you make doloop the last default parameter of add_candidate_1,
> > you can save more unnecessary changes to calls to add_candidate?
> >
> > -    cost = get_computation_cost (data, use, cand, false,
> > -                                &inv_vars, NULL, &inv_expr);
> > +    {
> > +      cost = get_computation_cost (data, use, cand, false, &inv_vars, NULL,
> > +                                  &inv_expr);
> > +      if (cand->doloop_p)
> > +       cost += targetm.doloop_cost_for_generic;
> > +    }
> > This adjustment
> >
> >    cost = get_computation_cost (data, use, cand, true,
> >                                &inv_vars, &can_autoinc, &inv_expr);
> >
> > +  if (cand->doloop_p)
> > +    cost += targetm.doloop_cost_for_address;
> > +
> > and this adjustment can be moved into get_computation_cost where all
> > cost adjustments are done.
> >
> > +      /* For doloop candidate/use pair, adjust to zero cost.  */
> > +      if (group->doloop_p && cand->doloop_p)
> > +       cost = no_cost;
> > Note above code handles comparing against zero case and decreases the
> > cost by one (which prefers the same kind candidate as doloop one),
> > it's very possible to have -1 cost for doloop cand here.  how about
> > just set to no_cost if it's positive?  Your call.
> >
> > +/* For the targets which support doloop, to predict whether later RTL doloop
> > +   transformation will perform on this loop, further detect the doloop use and
> > +   mark the flag doloop_use_p if predicted.  */
> > +
> > +void
> > +predict_and_process_doloop (struct ivopts_data *data)
> > A better name here? Sorry I don't have another candidate in mind...
> >
> > +  data->doloop_use_p = false;
> > This can be moved to the beginning of above
> > 'predict_and_process_doloop' function.
> >
> > Lastly, could you please add some brief description/comment about
> > doloop handling as a subsection in the file head comment?
> >
> > Otherwise, the ivopt changes look good to me.
> >
> > Thanks,
> > bin
> >
>
> Thanks for your prompt reply!  I've updated the code as your comments,
> the updated version is attached.  Looking forward to your review again.

Sorry to bother.

-      return get_scaled_computation_cost_at (data, at, cost);
+      cost = get_scaled_computation_cost_at (data, at, cost);
+      /* For doloop IV cand, add on the extra cost.  */
+      cost += cand->doloop_p ? targetm.doloop_cost_for_address : 0;
+      return cost;
Here the cost is adjusted after scaling, while:

+  /* For doloop IV cand, add on the extra cost.  */
+  if (cand->doloop_p && use->type == USE_NONLINEAR_EXPR)
+    cost += targetm.doloop_cost_for_generic;
+
   return get_scaled_computation_cost_at (data, at, cost);
is adjusted before scaling.  Please work consistently.

+      /* Simply use 1.5 * add cost for now, FIXME if there is some more accurate
+        cost evaluation way.  */
+      cost = comp_cost (1.5 * add_cost (speed, mode), 0);
+      break;
Is 1.5 important for some test cases?  Can we simply use 1 instead?
Or at least use xxx * 3 / 2 in order to avoid floating-point arithmetic.

Not sure if non-ivopts parts are already approved?  If so, the patch
is okay with above issues addressed.

Thanks very much for your time!

Thanks,
bin
>
>
> Thanks,
> Kewen
>
> -----
>
> gcc/ChangeLog
>
> 2019-08-23  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * config/rs6000/rs6000.c (TARGET_HAVE_COUNT_REG_DECR_P): New macro.
>         (TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
>         (TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
>         * target.def (have_count_reg_decr_p): New hook.
>         (doloop_cost_for_generic): Likewise.
>         (doloop_cost_for_address): Likewise.
>         * doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): Likewise.
>         (TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
>         (TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
>         * doc/tm.texi: Regenerate.
>         * tree-ssa-loop-ivopts.c (comp_cost::operator+=): Consider infinite cost
>         addend.
>         (record_group): Init doloop_p.
>         (add_candidate_1): Add optional argument doloop, change the handlings
>         accordingly.
>         (add_candidate): Likewise.
>         (generic_predict_doloop_p): Update attribute.
>         (force_expr_to_var_cost): Add costing for expressions COND_EXPR/LT_EXPR/
>         LE_EXPR/GT_EXPR/GE_EXPR/EQ_EXPR/NE_EXPR/UNORDERED_EXPR/ORDERED_EXPR/
>         UNLT_EXPR/UNLE_EXPR/UNGT_EXPR/UNGE_EXPR/UNEQ_EXPR/LTGT_EXPR/MAX_EXPR/
>         MIN_EXPR.
>         (get_computation_cost): Update for doloop IV cand extra cost.
>         (determine_group_iv_cost_cond): Update for doloop IV cand.
>         (determine_iv_cost): Likewise.
>         (ivopts_estimate_reg_pressure): Likewise.
>         (may_eliminate_iv): Update handlings for doloop IV cand.
>         (add_iv_candidate_for_doloop): New function.
>         (find_iv_candidates): Call function add_iv_candidate_for_doloop.
>         (iv_ca_set_no_cp): Update for doloop IV cand.
>         (iv_ca_set_cp): Likewise.
>         (iv_ca_dump): Dump register cost.
>         (find_doloop_use): New function.
>         (analyze_and_mark_doloop_use): Likewise.
>         (tree_ssa_iv_optimize_loop): Call function analyze_and_mark_doloop_use.
>
> gcc/testsuite/ChangeLog
>
> 2019-08-23  Kewen Lin  <linkw@gcc.gnu.org>
>
>         PR middle-end/80791
>         * gcc.dg/tree-ssa/ivopts-3.c: Adjust for doloop change.
>         * gcc.dg/tree-ssa/ivopts-lt.c: Likewise.
>         * gcc.dg/tree-ssa/pr32044.c: Likewise.
>
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-08-23 10:43                       ` Bin.Cheng
@ 2019-08-23 11:02                         ` Segher Boessenkool
  2019-09-11  6:18                           ` Kewen.Lin
  2019-08-24 22:43                         ` Kewen.Lin
  1 sibling, 1 reply; 43+ messages in thread
From: Segher Boessenkool @ 2019-08-23 11:02 UTC (permalink / raw)
  To: Bin.Cheng; +Cc: Kewen.Lin, gcc-patches List, Bill Schmidt, Richard Guenther

Hi!

On Fri, Aug 23, 2019 at 05:43:32PM +0800, Bin.Cheng wrote:
> On Fri, Aug 23, 2019 at 4:27 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
> Not sure if non-ivopts parts are already approved?  If so, the patch
> is okay with above issues addressed.

The rs6000 part is fine.  The target.def entries need some spell check
and copy-editing, but are obvious and trivial otherwise, and/or you can
approve it as ivopts maintainer.

> Thanks very much for your time!

And thank you as well Bin :-)


Segher

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-08-23 10:43                       ` Bin.Cheng
  2019-08-23 11:02                         ` Segher Boessenkool
@ 2019-08-24 22:43                         ` Kewen.Lin
  1 sibling, 0 replies; 43+ messages in thread
From: Kewen.Lin @ 2019-08-24 22:43 UTC (permalink / raw)
  To: Bin.Cheng
  Cc: gcc-patches List, Segher Boessenkool, Bill Schmidt, Richard Guenther

Hi Bin,

on 2019/8/23 下午5:43, Bin.Cheng wrote:
> On Fri, Aug 23, 2019 at 4:27 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> Hi Bin
>>
>> on 2019/8/23 上午10:19, Bin.Cheng wrote:
>>> On Thu, Aug 22, 2019 at 3:09 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>>>
>>>> Hi Bin,
>>>>
>>>> on 2019/8/22 下午1:46, Bin.Cheng wrote:
>>>>> On Thu, Aug 22, 2019 at 11:18 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>>>>>
>>>>>> Hi Bin,
>>>>>>
>>>>>> Thanks for your time!
>>>>>>
>>>>>> on 2019/8/21 下午8:32, Bin.Cheng wrote:
>>>>>>> On Wed, Aug 14, 2019 at 3:23 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>>>>>>>
>>>>>>>> Hi!
>>>>>>>>
>>>>>>>> Comparing to the previous versions of implementation mainly based on the
>>>>>>>> existing IV cands but zeroing the related group/use cost, this new one is based
>>>>>>>> on Richard and Segher's suggestion introducing one doloop dedicated IV cand.
>>>>>>>>
>>>>>>>> Some key points are listed below:
>>>>>>>>   1) New field doloop_p in struct iv_cand to indicate doloop dedicated IV cand.
>>>>>>>>   2) Special name "doloop" assigned.
>>>>>>>>   3) Doloop IV cand with form (niter+1, +, -1)
>>>>>>>>   4) For doloop IV cand, no extra one cost like BIV, assign zero cost for step.
>>>>>>>>   5) Support may_be_zero (regressed PR is in this case), the base of doloop IV
>>>>>>>>      can be COND_EXPR, add handlings in cand_value_at and may_eliminate_iv.
>>>>>>>>   6) Add more expr support in force_expr_to_var_cost for reasonable cost
>>>>>>>>      calculation on the IV base with may_be_zero (like COND_EXPR).
>>>>>>>>   7) Set zero cost when using doloop IV cand for doloop use.
>>>>>>>>   8) Add three hooks (should we merge _generic and _address?).
>>>>>>>>     *) have_count_reg_decr_p, is to indicate the target has special hardware
>>>>>>>>        count register, we shouldn't consider the impact of doloop IV when
>>>>>>>>        calculating register pressures.
>>>>>>>>     *) doloop_cost_for_generic, is the extra cost when using doloop IV cand for
>>>>>>>>        generic type IV use.
>>>>>>>>     *) doloop_cost_for_address, is the extra cost when using doloop IV cand for
>>>>>>>>        address type IV use.
>>
>>>> The new patch addressing the comments is attached.
>>>> Could you please have a look again?  Thanks in advance!
>>> Thanks for working on this.  A bit more nit-pickings.
>>>
>>> -    add_candidate_1 (data, base, step, important,
>>> -                    IP_NORMAL, use, NULL, orig_iv);
>>> -  if (ip_end_pos (data->current_loop)
>>> +    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, doloop,
>>> +                    orig_iv);
>>> +  if (!doloop && ip_end_pos (data->current_loop)
>>> Could you add some comments elaborating why ip_end_pos candidate
>>> shouldn't be added for doloop case?  Because the increment position is
>>> wrong.
>>>
>>> Also if you make doloop the last default parameter of add_candidate_1,
>>> you can save more unnecessary changes to calls to add_candidate?
>>>
>>> -    cost = get_computation_cost (data, use, cand, false,
>>> -                                &inv_vars, NULL, &inv_expr);
>>> +    {
>>> +      cost = get_computation_cost (data, use, cand, false, &inv_vars, NULL,
>>> +                                  &inv_expr);
>>> +      if (cand->doloop_p)
>>> +       cost += targetm.doloop_cost_for_generic;
>>> +    }
>>> This adjustment
>>>
>>>    cost = get_computation_cost (data, use, cand, true,
>>>                                &inv_vars, &can_autoinc, &inv_expr);
>>>
>>> +  if (cand->doloop_p)
>>> +    cost += targetm.doloop_cost_for_address;
>>> +
>>> and this adjustment can be moved into get_computation_cost where all
>>> cost adjustments are done.
>>>
>>> +      /* For doloop candidate/use pair, adjust to zero cost.  */
>>> +      if (group->doloop_p && cand->doloop_p)
>>> +       cost = no_cost;
>>> Note above code handles comparing against zero case and decreases the
>>> cost by one (which prefers the same kind candidate as doloop one),
>>> it's very possible to have -1 cost for doloop cand here.  how about
>>> just set to no_cost if it's positive?  Your call.
>>>
>>> +/* For the targets which support doloop, to predict whether later RTL doloop
>>> +   transformation will perform on this loop, further detect the doloop use and
>>> +   mark the flag doloop_use_p if predicted.  */
>>> +
>>> +void
>>> +predict_and_process_doloop (struct ivopts_data *data)
>>> A better name here? Sorry I don't have another candidate in mind...
>>>
>>> +  data->doloop_use_p = false;
>>> This can be moved to the beginning of above
>>> 'predict_and_process_doloop' function.
>>>
>>> Lastly, could you please add some brief description/comment about
>>> doloop handling as a subsection in the file head comment?
>>>
>>> Otherwise, the ivopt changes look good to me.
>>>
>>> Thanks,
>>> bin
>>>
>>
>> Thanks for your prompt reply!  I've updated the code as your comments,
>> the updated version is attached.  Looking forward to your review again.
> 
> Sorry to bother.
> 
> -      return get_scaled_computation_cost_at (data, at, cost);
> +      cost = get_scaled_computation_cost_at (data, at, cost);
> +      /* For doloop IV cand, add on the extra cost.  */
> +      cost += cand->doloop_p ? targetm.doloop_cost_for_address : 0;
> +      return cost;
> Here the cost is adjusted after scaling, while:
> 
> +  /* For doloop IV cand, add on the extra cost.  */
> +  if (cand->doloop_p && use->type == USE_NONLINEAR_EXPR)
> +    cost += targetm.doloop_cost_for_generic;
> +
>    return get_scaled_computation_cost_at (data, at, cost);
> is adjusted before scaling.  Please work consistently.
> 

Thanks for catching!

> +      /* Simply use 1.5 * add cost for now, FIXME if there is some more accurate
> +        cost evaluation way.  */
> +      cost = comp_cost (1.5 * add_cost (speed, mode), 0);
> +      break;
> Is 1.5 important for some test cases?  Can we simply use 1 instead?
> Or at least use xxx * 3 / 2 in order to avoid floating-point arithmetic.
> 

No, I was thinking they may deserve a bit more than the add since
the cost was a high value before this patch, two was too much for some
cases in my initial prototype, then I just chose 1.5.
I think it should be fine to use 1 here.


The appended diff:

diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 88e7890..31ab858 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -4353,9 +4353,9 @@ force_expr_to_var_cost (tree expr, bool speed)
     case LTGT_EXPR:
     case MAX_EXPR:
     case MIN_EXPR:
-      /* Simply use 1.5 * add cost for now, FIXME if there is some more accurate
-        cost evaluation way.  */
-      cost = comp_cost (1.5 * add_cost (speed, mode), 0);
+      /* Simply use add cost for now, FIXME if there is some more accurate cost
+        evaluation way.  */
+      cost = comp_cost (add_cost (speed, mode), 0);
       break;

     default:
@@ -4786,11 +4786,13 @@ get_computation_cost (struct ivopts_data *data, struct iv_use *use,
   if (comp_inv && !integer_zerop (comp_inv))
     cost += add_cost (speed, TYPE_MODE (utype));

+  cost = get_scaled_computation_cost_at (data, at, cost);
+
   /* For doloop IV cand, add on the extra cost.  */
   if (cand->doloop_p && use->type == USE_NONLINEAR_EXPR)
     cost += targetm.doloop_cost_for_generic;

-  return get_scaled_computation_cost_at (data, at, cost);
+  return cost;
 }


> Not sure if non-ivopts parts are already approved?  If so, the patch
> is okay with above issues addressed.
> 
> Thanks very much for your time!
> 

Thanks a lot for your time and helpful comments as well!!!


Thanks,
Kewen

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-08-23 11:02                         ` Segher Boessenkool
@ 2019-09-11  6:18                           ` Kewen.Lin
  2019-09-12  8:14                             ` Richard Biener
  0 siblings, 1 reply; 43+ messages in thread
From: Kewen.Lin @ 2019-09-11  6:18 UTC (permalink / raw)
  To: gcc-patches List
  Cc: Segher Boessenkool, Bin.Cheng, Bill Schmidt, Richard Guenther

[-- Attachment #1: Type: text/plain, Size: 2768 bytes --]

Hi,

Sorry for the late update.  I've updated the wording of the target hooks part.

Could someone help to review it?  Thanks in advance!

By the way, as previous emails in this thread, Bin has approved the IVOPTs
part, while Segher has approved the rs6000 part.


Thanks,
Kewen

-----

gcc/ChangeLog

2019-09-11  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* config/rs6000/rs6000.c (TARGET_HAVE_COUNT_REG_DECR_P): New macro.
	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
	* target.def (have_count_reg_decr_p): New hook.
	(doloop_cost_for_generic): Likewise.
	(doloop_cost_for_address): Likewise.
	* doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): Likewise.
	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
	* doc/tm.texi: Regenerate.
	* tree-ssa-loop-ivopts.c (comp_cost::operator+=): Consider infinite cost
	addend.
	(record_group): Init doloop_p.
	(add_candidate_1): Add optional argument doloop, change the handlings
	accordingly.
	(add_candidate): Likewise.
	(generic_predict_doloop_p): Update attribute.
	(force_expr_to_var_cost): Add costing for expressions COND_EXPR/LT_EXPR/
	LE_EXPR/GT_EXPR/GE_EXPR/EQ_EXPR/NE_EXPR/UNORDERED_EXPR/ORDERED_EXPR/
	UNLT_EXPR/UNLE_EXPR/UNGT_EXPR/UNGE_EXPR/UNEQ_EXPR/LTGT_EXPR/MAX_EXPR/
	MIN_EXPR.
	(get_computation_cost): Update for doloop IV cand extra cost.
	(determine_group_iv_cost_cond): Update for doloop IV cand.
	(determine_iv_cost): Likewise.
	(ivopts_estimate_reg_pressure): Likewise.
	(may_eliminate_iv): Update handlings for doloop IV cand.
	(add_iv_candidate_for_doloop): New function.
	(find_iv_candidates): Call function add_iv_candidate_for_doloop.
	(iv_ca_set_no_cp): Update for doloop IV cand.
	(iv_ca_set_cp): Likewise.
	(iv_ca_dump): Dump register cost.
	(find_doloop_use): New function.
	(analyze_and_mark_doloop_use): Likewise.
	(tree_ssa_iv_optimize_loop): Call function analyze_and_mark_doloop_use.

gcc/testsuite/ChangeLog

2019-09-11  Kewen Lin  <linkw@gcc.gnu.org>

	PR middle-end/80791
	* gcc.dg/tree-ssa/ivopts-3.c: Adjust for doloop change.
	* gcc.dg/tree-ssa/ivopts-lt.c: Likewise.
	* gcc.dg/tree-ssa/pr32044.c: Likewise.


on 2019/8/23 下午6:18, Segher Boessenkool wrote:
> Hi!
> 
> On Fri, Aug 23, 2019 at 05:43:32PM +0800, Bin.Cheng wrote:
>> On Fri, Aug 23, 2019 at 4:27 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>> Not sure if non-ivopts parts are already approved?  If so, the patch
>> is okay with above issues addressed.
> 
> The rs6000 part is fine.  The target.def entries need some spell check
> and copy-editing, but are obvious and trivial otherwise, and/or you can
> approve it as ivopts maintainer.
> 
>> Thanks very much for your time!
> 
> And thank you as well Bin :-)
> 
> 
> Segher
> 

[-- Attachment #2: doloop_dedicated_iv5.diff --]
[-- Type: text/plain, Size: 23948 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 6667cd0..5eccbdc 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1912,6 +1912,16 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_PREDICT_DOLOOP_P
 #define TARGET_PREDICT_DOLOOP_P rs6000_predict_doloop_p
 
+#undef TARGET_HAVE_COUNT_REG_DECR_P
+#define TARGET_HAVE_COUNT_REG_DECR_P true
+
+/* 1000000000 is infinite cost in IVOPTs.  */
+#undef TARGET_DOLOOP_COST_FOR_GENERIC
+#define TARGET_DOLOOP_COST_FOR_GENERIC 1000000000
+
+#undef TARGET_DOLOOP_COST_FOR_ADDRESS
+#define TARGET_DOLOOP_COST_FOR_ADDRESS 1000000000
+
 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV rs6000_atomic_assign_expand_fenv
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index c2aa4d0..2d3015c 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11618,6 +11618,36 @@ loops, and will help ivopts to make some decisions.
 The default version of this hook returns false.
 @end deftypefn
 
+@deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
+Return true if the target supports hardware count register for decrement
+and branch.
+The default value is false.
+@end deftypevr
+
+@deftypevr {Target Hook} int64_t TARGET_DOLOOP_COST_FOR_GENERIC
+One IV candidate dedicated for doloop is introduced in IVOPTs, we can
+calculate the computation cost of adopting it to any generic IV use by
+function get_computation_cost as before.  But for targets which have
+hardware count register support for decrement and branch, it may have to
+move IV value from hardware count register to general purpose register
+while doloop IV candidate is used for generic IV uses.  It probably takes
+expensive penalty.  This hook allows target owners to define the cost for
+this especially for generic IV uses.
+The default value is zero.
+@end deftypevr
+
+@deftypevr {Target Hook} int64_t TARGET_DOLOOP_COST_FOR_ADDRESS
+One IV candidate dedicated for doloop is introduced in IVOPTs, we can
+calculate the computation cost of adopting it to any address IV use by
+function get_computation_cost as before.  But for targets which have
+hardware count register support for decrement and branch, it may have to
+move IV value from hardware count register to general purpose register
+while doloop IV candidate is used for address IV uses.  It probably takes
+expensive penalty.  This hook allows target owners to define the cost for
+this especially for address IV uses.
+The default value is zero.
+@end deftypevr
+
 @deftypefn {Target Hook} bool TARGET_CAN_USE_DOLOOP_P (const widest_int @var{&iterations}, const widest_int @var{&iterations_max}, unsigned int @var{loop_depth}, bool @var{entered_at_top})
 Return true if it is possible to use low-overhead loops (@code{doloop_end}
 and @code{doloop_begin}) for a particular loop.  @var{iterations} gives the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index b4d57b8..4346773 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7946,6 +7946,12 @@ to by @var{ce_info}.
 
 @hook TARGET_PREDICT_DOLOOP_P
 
+@hook TARGET_HAVE_COUNT_REG_DECR_P
+
+@hook TARGET_DOLOOP_COST_FOR_GENERIC
+
+@hook TARGET_DOLOOP_COST_FOR_ADDRESS
+
 @hook TARGET_CAN_USE_DOLOOP_P
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
diff --git a/gcc/target.def b/gcc/target.def
index 71b6972..3328cc5 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4246,6 +4246,39 @@ The default version of this hook returns false.",
  bool, (struct loop *loop),
  default_predict_doloop_p)
 
+DEFHOOKPOD
+(have_count_reg_decr_p,
+ "Return true if the target supports hardware count register for decrement\n\
+and branch.\n\
+The default value is false.",
+ bool, false)
+
+DEFHOOKPOD
+(doloop_cost_for_generic,
+ "One IV candidate dedicated for doloop is introduced in IVOPTs, we can\n\
+calculate the computation cost of adopting it to any generic IV use by\n\
+function get_computation_cost as before.  But for targets which have\n\
+hardware count register support for decrement and branch, it may have to\n\
+move IV value from hardware count register to general purpose register\n\
+while doloop IV candidate is used for generic IV uses.  It probably takes\n\
+expensive penalty.  This hook allows target owners to define the cost for\n\
+this especially for generic IV uses.\n\
+The default value is zero.",
+ int64_t, 0)
+
+DEFHOOKPOD
+(doloop_cost_for_address,
+ "One IV candidate dedicated for doloop is introduced in IVOPTs, we can\n\
+calculate the computation cost of adopting it to any address IV use by\n\
+function get_computation_cost as before.  But for targets which have\n\
+hardware count register support for decrement and branch, it may have to\n\
+move IV value from hardware count register to general purpose register\n\
+while doloop IV candidate is used for address IV uses.  It probably takes\n\
+expensive penalty.  This hook allows target owners to define the cost for\n\
+this especially for address IV uses.\n\
+The default value is zero.",
+ int64_t, 0)
+
 DEFHOOK
 (can_use_doloop_p,
  "Return true if it is possible to use low-overhead loops (@code{doloop_end}\n\
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
index 214e6a7..ce4b1d0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
@@ -10,4 +10,6 @@ int main (void)
     f2 ();
 }
 
-/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" } }  */
+/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* More debug information emitted for doloop on powerpc.  */
+/* { dg-final { scan-tree-dump-times "!= 0" 6 "ivopts" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 7d5859b..71d7f67 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -17,6 +17,7 @@ f1 (char *p, uintptr_t i, uintptr_t n)
   while (i < n);
 }
 
-/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" } } */
-/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts"} } */
-/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" } } */
+/* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI" 2 "ivopts" { target { powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "PHI <p_" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times "p_\[0-9\]* <" 1 "ivopts" { target { ! powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c b/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
index 8a8977a..06c27b0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr32044.c
@@ -1,6 +1,10 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -fdump-tree-optimized" } */
 
+/* For powerpc, disable doloop IV cand generation in IVOPTs to avoid unexpected
+   division operation for its base setup.  */
+/* { dg-additional-options "-fno-branch-count-reg" { target { powerpc*-*-* } } } */
+
 int foo (int n)
 {
   while (n >= 45)
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 530ea4a..edbdf43 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -64,7 +64,30 @@ along with GCC; see the file COPYING3.  If not see
    All of this is done loop by loop.  Doing it globally is theoretically
    possible, it might give a better performance and it might enable us
    to decide costs more precisely, but getting all the interactions right
-   would be complicated.  */
+   would be complicated.
+
+   For the targets supporting low-overhead loops, IVOPTs has to take care of
+   the loops which will probably be transformed in RTL doloop optimization,
+   to try to make selected IV candidate set optimal.  The process of doloop
+   support includes:
+
+   1) Analyze whether the current loop will be transformed to doloop, find and
+      mark its compare type IV use as doloop use (iv_group field doloop_p), and
+      set flag doloop_use_p of ivopts_data to notify subsequent processings on
+      doloop.  See analyze_and_mark_doloop_use and its callees for the details.
+      The target hook predict_doloop_p can be used for target specific checks.
+
+   2) Add one doloop dedicated IV cand {(may_be_zero ? 1 : (niter + 1)), +, -1},
+      set flag doloop_p of iv_cand, step cost is set as zero and no extra cost
+      like biv.  For cost determination between doloop IV cand and IV use, the
+      target hooks doloop_cost_for_generic and doloop_cost_for_address are
+      provided to add on extra costs for generic type and address type IV use.
+      Zero cost is assigned to the pair between doloop IV cand and doloop IV
+      use, and bound zero is set for IV elimination.
+
+   3) With the cost setting in step 2), the current cost model based IV
+      selection algorithm will process as usual, pick up doloop dedicated IV if
+      profitable.  */
 
 #include "config.h"
 #include "system.h"
@@ -275,6 +298,9 @@ comp_cost::operator+= (comp_cost cost)
 comp_cost
 comp_cost::operator+= (HOST_WIDE_INT c)
 {
+  if (c >= INFTY)
+    this->cost = INFTY;
+
   if (infinite_cost_p ())
     return *this;
 
@@ -399,6 +425,8 @@ struct iv_group
   struct cost_pair *cost_map;
   /* The selected candidate for the group.  */
   struct iv_cand *selected;
+  /* To indicate this is a doloop use group.  */
+  bool doloop_p;
   /* Uses in the group.  */
   vec<struct iv_use *> vuses;
 };
@@ -439,6 +467,7 @@ struct iv_cand
 			   be hoisted out of loop.  */
   struct iv *orig_iv;	/* The original iv if this cand is added from biv with
 			   smaller type.  */
+  bool doloop_p;	/* Whether this is a doloop candidate.  */
 };
 
 /* Hashtable entry for common candidate derived from iv uses.  */
@@ -612,6 +641,9 @@ struct ivopts_data
 
   /* Whether the loop body can only be exited via single exit.  */
   bool loop_single_exit_p;
+
+  /* Whether the loop has doloop comparison use.  */
+  bool doloop_use_p;
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -1528,6 +1560,7 @@ record_group (struct ivopts_data *data, enum use_type type)
   group->type = type;
   group->related_cands = BITMAP_ALLOC (NULL);
   group->vuses.create (1);
+  group->doloop_p = false;
 
   data->vgroups.safe_push (group);
   return group;
@@ -3017,10 +3050,10 @@ get_loop_invariant_expr (struct ivopts_data *data, tree inv_expr)
    replacement of the final value of the iv by a direct computation.  */
 
 static struct iv_cand *
-add_candidate_1 (struct ivopts_data *data,
-		 tree base, tree step, bool important, enum iv_position pos,
-		 struct iv_use *use, gimple *incremented_at,
-		 struct iv *orig_iv = NULL)
+add_candidate_1 (struct ivopts_data *data, tree base, tree step, bool important,
+		 enum iv_position pos, struct iv_use *use,
+		 gimple *incremented_at, struct iv *orig_iv = NULL,
+		 bool doloop = false)
 {
   unsigned i;
   struct iv_cand *cand = NULL;
@@ -3079,11 +3112,15 @@ add_candidate_1 (struct ivopts_data *data,
       cand->pos = pos;
       if (pos != IP_ORIGINAL)
 	{
-	  cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
+	  if (doloop)
+	    cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "doloop");
+	  else
+	    cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
 	  cand->var_after = cand->var_before;
 	}
       cand->important = important;
       cand->incremented_at = incremented_at;
+      cand->doloop_p = doloop;
       data->vcands.safe_push (cand);
 
       if (!poly_int_tree_p (step))
@@ -3116,6 +3153,7 @@ add_candidate_1 (struct ivopts_data *data,
     }
 
   cand->important |= important;
+  cand->doloop_p |= doloop;
 
   /* Relate candidate to the group for which it is added.  */
   if (use)
@@ -3209,14 +3247,16 @@ add_autoinc_candidates (struct ivopts_data *data, tree base, tree step,
    the end of loop.  */
 
 static void
-add_candidate (struct ivopts_data *data,
-	       tree base, tree step, bool important, struct iv_use *use,
-	       struct iv *orig_iv = NULL)
+add_candidate (struct ivopts_data *data, tree base, tree step, bool important,
+	       struct iv_use *use, struct iv *orig_iv = NULL,
+	       bool doloop = false)
 {
   if (ip_normal_pos (data->current_loop))
-    add_candidate_1 (data, base, step, important,
-		     IP_NORMAL, use, NULL, orig_iv);
-  if (ip_end_pos (data->current_loop)
+    add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, orig_iv,
+		     doloop);
+  /* Exclude doloop candidate here since it requires decrement then comparison
+     and jump, the IP_END position doesn't match.  */
+  if (!doloop && ip_end_pos (data->current_loop)
       && allow_ip_end_pos_p (data->current_loop))
     add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
 }
@@ -3724,7 +3764,7 @@ prepare_decl_rtl (tree *expr_p, int *ws, void *data)
    Some RTL specific checks seems unable to be checked in gimple, if any new
    checks or easy checks _are_ missing here, please add them.  */
 
-static bool ATTRIBUTE_UNUSED
+static bool
 generic_predict_doloop_p (struct ivopts_data *data)
 {
   struct loop *loop = data->current_loop;
@@ -4177,6 +4217,36 @@ force_expr_to_var_cost (tree expr, bool speed)
       STRIP_NOPS (op0);
       op1 = NULL_TREE;
       break;
+    /* See add_iv_candidate_for_doloop, for doloop may_be_zero case, we
+       introduce COND_EXPR for IV base, need to support better cost estimation
+       for this COND_EXPR and tcc_comparison.  */
+    case COND_EXPR:
+      op0 = TREE_OPERAND (expr, 1);
+      STRIP_NOPS (op0);
+      op1 = TREE_OPERAND (expr, 2);
+      STRIP_NOPS (op1);
+      break;
+    case LT_EXPR:
+    case LE_EXPR:
+    case GT_EXPR:
+    case GE_EXPR:
+    case EQ_EXPR:
+    case NE_EXPR:
+    case UNORDERED_EXPR:
+    case ORDERED_EXPR:
+    case UNLT_EXPR:
+    case UNLE_EXPR:
+    case UNGT_EXPR:
+    case UNGE_EXPR:
+    case UNEQ_EXPR:
+    case LTGT_EXPR:
+    case MAX_EXPR:
+    case MIN_EXPR:
+      op0 = TREE_OPERAND (expr, 0);
+      STRIP_NOPS (op0);
+      op1 = TREE_OPERAND (expr, 1);
+      STRIP_NOPS (op1);
+      break;
 
     default:
       /* Just an arbitrary value, FIXME.  */
@@ -4258,6 +4328,35 @@ force_expr_to_var_cost (tree expr, bool speed)
     case RSHIFT_EXPR:
       cost = comp_cost (add_cost (speed, mode), 0);
       break;
+    case COND_EXPR:
+      op0 = TREE_OPERAND (expr, 0);
+      STRIP_NOPS (op0);
+      if (op0 == NULL_TREE || TREE_CODE (op0) == SSA_NAME
+	  || CONSTANT_CLASS_P (op0))
+	cost = no_cost;
+      else
+	cost = force_expr_to_var_cost (op0, speed);
+      break;
+    case LT_EXPR:
+    case LE_EXPR:
+    case GT_EXPR:
+    case GE_EXPR:
+    case EQ_EXPR:
+    case NE_EXPR:
+    case UNORDERED_EXPR:
+    case ORDERED_EXPR:
+    case UNLT_EXPR:
+    case UNLE_EXPR:
+    case UNGT_EXPR:
+    case UNGE_EXPR:
+    case UNEQ_EXPR:
+    case LTGT_EXPR:
+    case MAX_EXPR:
+    case MIN_EXPR:
+      /* Simply use add cost for now, FIXME if there is some more accurate cost
+	 evaluation way.  */
+      cost = comp_cost (add_cost (speed, mode), 0);
+      break;
 
     default:
       gcc_unreachable ();
@@ -4634,7 +4733,10 @@ get_computation_cost (struct ivopts_data *data, struct iv_use *use,
     {
       cost = get_address_cost (data, use, cand, &aff_inv, &aff_var, ratio,
 			       inv_vars, inv_expr, can_autoinc, speed);
-      return get_scaled_computation_cost_at (data, at, cost);
+      cost = get_scaled_computation_cost_at (data, at, cost);
+      /* For doloop IV cand, add on the extra cost.  */
+      cost += cand->doloop_p ? targetm.doloop_cost_for_address : 0;
+      return cost;
     }
 
   bool simple_inv = (aff_combination_const_p (&aff_inv)
@@ -4684,7 +4786,13 @@ get_computation_cost (struct ivopts_data *data, struct iv_use *use,
   if (comp_inv && !integer_zerop (comp_inv))
     cost += add_cost (speed, TYPE_MODE (utype));
 
-  return get_scaled_computation_cost_at (data, at, cost);
+  cost = get_scaled_computation_cost_at (data, at, cost);
+
+  /* For doloop IV cand, add on the extra cost.  */
+  if (cand->doloop_p && use->type == USE_NONLINEAR_EXPR)
+    cost += targetm.doloop_cost_for_generic;
+
+  return cost;
 }
 
 /* Determines cost of computing the use in GROUP with CAND in a generic
@@ -5142,6 +5250,15 @@ may_eliminate_iv (struct ivopts_data *data,
 	}
     }
 
+  /* For doloop IV cand, the bound would be zero.  It's safe whether
+     may_be_zero set or not.  */
+  if (cand->doloop_p)
+    {
+      *bound = build_int_cst (TREE_TYPE (cand->iv->base), 0);
+      *comp = iv_elimination_compare (data, use);
+      return true;
+    }
+
   cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
 
   *bound = fold_convert (TREE_TYPE (cand->iv->base),
@@ -5264,6 +5381,9 @@ determine_group_iv_cost_cond (struct ivopts_data *data,
       inv_vars = inv_vars_elim;
       inv_vars_elim = NULL;
       inv_expr = inv_expr_elim;
+      /* For doloop candidate/use pair, adjust to zero cost.  */
+      if (group->doloop_p && cand->doloop_p && elim_cost.cost > no_cost.cost)
+	cost = no_cost;
     }
   else
     {
@@ -5390,6 +5510,42 @@ relate_compare_use_with_all_cands (struct ivopts_data *data)
     }
 }
 
+/* Add one doloop dedicated IV candidate:
+     - Base is (may_be_zero ? 1 : (niter + 1)).
+     - Step is -1.  */
+
+static void
+add_iv_candidate_for_doloop (struct ivopts_data *data)
+{
+  tree_niter_desc *niter_desc = niter_for_single_dom_exit (data);
+  gcc_assert (niter_desc && niter_desc->assumptions);
+
+  tree niter = niter_desc->niter;
+  tree ntype = TREE_TYPE (niter);
+  gcc_assert (TREE_CODE (ntype) == INTEGER_TYPE);
+
+  tree may_be_zero = niter_desc->may_be_zero;
+  if (may_be_zero && integer_zerop (may_be_zero))
+    may_be_zero = NULL_TREE;
+  if (may_be_zero)
+    {
+      if (COMPARISON_CLASS_P (may_be_zero))
+	{
+	  niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
+			       build_int_cst (ntype, 0),
+			       rewrite_to_non_trapping_overflow (niter));
+	}
+      /* Don't try to obtain the iteration count expression when may_be_zero is
+	 integer_nonzerop (actually iteration count is one) or else.  */
+      else
+	return;
+    }
+
+  tree base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+			   build_int_cst (ntype, 1));
+  add_candidate (data, base, build_int_cst (ntype, -1), true, NULL, NULL, true);
+}
+
 /* Finds the candidates for the induction variables.  */
 
 static void
@@ -5398,6 +5554,10 @@ find_iv_candidates (struct ivopts_data *data)
   /* Add commonly used ivs.  */
   add_standard_iv_candidates (data);
 
+  /* Add doloop dedicated ivs.  */
+  if (data->doloop_use_p)
+    add_iv_candidate_for_doloop (data);
+
   /* Add old induction variables.  */
   add_iv_candidate_for_bivs (data);
 
@@ -5578,16 +5738,21 @@ determine_iv_cost (struct ivopts_data *data, struct iv_cand *cand)
      or a const set.  */
   if (cost_base.cost == 0)
     cost_base.cost = COSTS_N_INSNS (1);
-  cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
-
+  /* Doloop decrement should be considered as zero cost.  */
+  if (cand->doloop_p)
+    cost_step = 0;
+  else
+    cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
   cost = cost_step + adjust_setup_cost (data, cost_base.cost);
 
   /* Prefer the original ivs unless we may gain something by replacing it.
      The reason is to make debugging simpler; so this is not relevant for
      artificial ivs created by other optimization passes.  */
-  if (cand->pos != IP_ORIGINAL
-      || !SSA_NAME_VAR (cand->var_before)
-      || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+  if ((cand->pos != IP_ORIGINAL
+       || !SSA_NAME_VAR (cand->var_before)
+       || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+      /* Prefer doloop as well.  */
+      && !cand->doloop_p)
     cost++;
 
   /* Prefer not to insert statements into latch unless there are some
@@ -5832,7 +5997,8 @@ iv_ca_set_no_cp (struct ivopts_data *data, struct iv_ca *ivs,
   if (ivs->n_cand_uses[cid] == 0)
     {
       bitmap_clear_bit (ivs->cands, cid);
-      ivs->n_cands--;
+      if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
+	ivs->n_cands--;
       ivs->cand_cost -= cp->cand->cost;
       iv_ca_set_remove_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
       iv_ca_set_remove_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
@@ -5889,7 +6055,8 @@ iv_ca_set_cp (struct ivopts_data *data, struct iv_ca *ivs,
       if (ivs->n_cand_uses[cid] == 1)
 	{
 	  bitmap_set_bit (ivs->cands, cid);
-	  ivs->n_cands++;
+	  if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
+	    ivs->n_cands++;
 	  ivs->cand_cost += cp->cand->cost;
 	  iv_ca_set_add_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
 	  iv_ca_set_add_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
@@ -6134,6 +6301,8 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs)
 
   fprintf (file, "  cost: %" PRId64 " (complexity %d)\n", cost.cost,
 	   cost.complexity);
+  fprintf (file, "  reg_cost: %d\n",
+	   ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands));
   fprintf (file, "  cand_cost: %" PRId64 "\n  cand_group_cost: "
 	   "%" PRId64 " (complexity %d)\n", ivs->cand_cost,
 	   ivs->cand_use_cost.cost, ivs->cand_use_cost.complexity);
@@ -7568,6 +7737,77 @@ determine_scaling_factor (struct ivopts_data *data, basic_block *body)
     }
 }
 
+/* Mark the doloop compare group's doloop_p; return true if one is found.  */
+
+static bool
+find_doloop_use (struct ivopts_data *data)
+{
+  struct loop *loop = data->current_loop;
+
+  for (unsigned i = 0; i < data->vgroups.length (); i++)
+    {
+      struct iv_group *group = data->vgroups[i];
+      if (group->type == USE_COMPARE)
+	{
+	  gcc_assert (group->vuses.length () == 1);
+	  struct iv_use *use = group->vuses[0];
+	  gimple *stmt = use->stmt;
+	  if (gimple_code (stmt) == GIMPLE_COND)
+	    {
+	      basic_block bb = gimple_bb (stmt);
+	      edge true_edge, false_edge;
+	      extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+	      /* One edge of this comparison enters the loop latch.  Require the
+		 latch to be empty for now.  */
+	      if ((loop->latch == true_edge->dest
+		   || loop->latch == false_edge->dest)
+		  && empty_block_p (loop->latch))
+		{
+		  group->doloop_p = true;
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file, "Doloop cmp iv use: ");
+		      print_gimple_stmt (dump_file, stmt, TDF_DETAILS);
+		    }
+		  return true;
+		}
+	    }
+	}
+    }
+
+  return false;
+}
+
+/* For targets that support doloop, predict whether the later RTL doloop
+   transformation will be performed on this loop; if so, detect the doloop
+   use and set the flag doloop_use_p.  */
+
+void
+analyze_and_mark_doloop_use (struct ivopts_data *data)
+{
+  data->doloop_use_p = false;
+
+  if (!flag_branch_on_count_reg)
+    return;
+
+  if (!generic_predict_doloop_p (data))
+    return;
+
+  if (find_doloop_use (data))
+    {
+      data->doloop_use_p = true;
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  struct loop *loop = data->current_loop;
+	  fprintf (dump_file,
+		   "Predict loop %d can perform"
+		   " doloop optimization later.\n",
+		   loop->num);
+	  flow_loop_dump (loop, dump_file, NULL, 1);
+	}
+    }
+}
+
 /* Optimizes the LOOP.  Returns true if anything changed.  */
 
 static bool
@@ -7622,6 +7862,9 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop,
   /* Determine cost scaling factor for basic blocks in loop.  */
   determine_scaling_factor (data, body);
 
+  /* Analyze doloop possibility and mark the doloop use if predicted.  */
+  analyze_and_mark_doloop_use (data);
+
   /* Finds candidates for the induction variables (item 2).  */
   find_iv_candidates (data);
 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-09-11  6:18                           ` Kewen.Lin
@ 2019-09-12  8:14                             ` Richard Biener
  2019-09-14  9:35                               ` Kewen.Lin
  0 siblings, 1 reply; 43+ messages in thread
From: Richard Biener @ 2019-09-12  8:14 UTC (permalink / raw)
  To: Kewen.Lin; +Cc: gcc-patches List, Segher Boessenkool, Bin.Cheng, Bill Schmidt

[-- Attachment #1: Type: text/plain, Size: 3292 bytes --]

On Wed, 11 Sep 2019, Kewen.Lin wrote:

> Hi,
> 
> Sorry for the late update.  I've updated the words of target hooks part.
> 
> Could someone help to review it?  Thanks in advance!
> 
> By the way, as previous emails in this thread, Bin has approved the IVOPTs
> part, while Segher has approved the rs6000 part.

The target hooks part is OK.  I guess we'll have to extend it eventually
in case other targets want to make use of it.

Thanks,
Richard.

> 
> Thanks,
> Kewen
> 
> -----
> 
> gcc/ChangeLog
> 
> 2019-09-11  Kewen Lin  <linkw@gcc.gnu.org>
> 
> 	PR middle-end/80791
> 	* config/rs6000/rs6000.c (TARGET_HAVE_COUNT_REG_DECR_P): New macro.
> 	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
> 	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
> 	* target.def (have_count_reg_decr_p): New hook.
> 	(doloop_cost_for_generic): Likewise.
> 	(doloop_cost_for_address): Likewise.
> 	* doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): Likewise.
> 	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
> 	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
> 	* doc/tm.texi: Regenerate.
> 	* tree-ssa-loop-ivopts.c (comp_cost::operator+=): Consider infinite cost
> 	addend.
> 	(record_group): Init doloop_p.
> 	(add_candidate_1): Add optional argument doloop, change the handlings
> 	accordingly.
> 	(add_candidate): Likewise.
> 	(generic_predict_doloop_p): Update attribute.
> 	(force_expr_to_var_cost): Add costing for expressions COND_EXPR/LT_EXPR/
> 	LE_EXPR/GT_EXPR/GE_EXPR/EQ_EXPR/NE_EXPR/UNORDERED_EXPR/ORDERED_EXPR/
> 	UNLT_EXPR/UNLE_EXPR/UNGT_EXPR/UNGE_EXPR/UNEQ_EXPR/LTGT_EXPR/MAX_EXPR/
> 	MIN_EXPR.
> 	(get_computation_cost): Update for doloop IV cand extra cost.	
> 	(determine_group_iv_cost_cond): Update for doloop IV cand.
> 	(determine_iv_cost): Likewise.
> 	(ivopts_estimate_reg_pressure): Likewise.
> 	(may_eliminate_iv): Update handlings for doloop IV cand.
> 	(add_iv_candidate_for_doloop): New function.
> 	(find_iv_candidates): Call function add_iv_candidate_for_doloop.
> 	(iv_ca_set_no_cp): Update for doloop IV cand.
> 	(iv_ca_set_cp): Likewise.
> 	(iv_ca_dump): Dump register cost.
> 	(find_doloop_use): New function.
> 	(analyze_and_mark_doloop_use): Likewise.
> 	(tree_ssa_iv_optimize_loop): Call function analyze_and_mark_doloop_use.
> 
> gcc/testsuite/ChangeLog
> 
> 2019-09-11  Kewen Lin  <linkw@gcc.gnu.org>
> 
> 	PR middle-end/80791
> 	* gcc.dg/tree-ssa/ivopts-3.c: Adjust for doloop change.
> 	* gcc.dg/tree-ssa/ivopts-lt.c: Likewise.
> 	* gcc.dg/tree-ssa/pr32044.c: Likewise.
> 
> 
> on 2019/8/23 下午6:18, Segher Boessenkool wrote:
> > Hi!
> > 
> > On Fri, Aug 23, 2019 at 05:43:32PM +0800, Bin.Cheng wrote:
> >> On Fri, Aug 23, 2019 at 4:27 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
> >> Not sure if non-ivopts parts are already approved?  If so, the patch
> >> is okay with above issues addressed.
> > 
> > The rs6000 part is fine.  The target.def entries need some spell check
> > and copy-editing, but are obvious and trivial otherwise, and/or you can
> > approve it as ivopts maintainer.
> > 
> >> Thanks very much for your time!
> > 
> > And thank you as well Bin :-)
> > 
> > 
> > Segher
> > 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 247165 (AG München)

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v6 3/3] PR80791 Consider doloop cmp use in ivopts
  2019-09-12  8:14                             ` Richard Biener
@ 2019-09-14  9:35                               ` Kewen.Lin
  0 siblings, 0 replies; 43+ messages in thread
From: Kewen.Lin @ 2019-09-14  9:35 UTC (permalink / raw)
  To: Richard Biener
  Cc: gcc-patches List, Segher Boessenkool, Bin.Cheng, Bill Schmidt

on 2019/9/12 下午4:14, Richard Biener wrote:
> On Wed, 11 Sep 2019, Kewen.Lin wrote:
> 
>> Hi,
>>
>> Sorry for the late update.  I've updated the words of target hooks part.
>>
>> Could someone help to review it?  Thanks in advance!
>>
>> By the way, as previous emails in this thread, Bin has approved the IVOPTs
>> part, while Segher has approved the rs6000 part.
> 
> The target hooks part is OK.  I guess we'll have to extend it eventually
> in case other targets want to make use of it.
> 

Thanks Richard!  Committed by r275713.

Yes, it's enough while the doloop IV costs either zero or infinite for
generic/address uses, but if a target wants some other value, we may have to
treat it as one common cost shared by all generic/address uses.  It's like the
IV candidate cost but not the same, since it's only needed when the doloop IV
is used for generic/address uses; I guess it requires some changes in the
candidate set cost calculation.  I chose to keep it simple in the first place,
but will keep an eye out for adoption by any other targets.


Thanks,
Kewen

> Thanks,
> Richard.
> 
>>
>> Thanks,
>> Kewen
>>
>> -----
>>
>> gcc/ChangeLog
>>
>> 2019-09-11  Kewen Lin  <linkw@gcc.gnu.org>
>>
>> 	PR middle-end/80791
>> 	* config/rs6000/rs6000.c (TARGET_HAVE_COUNT_REG_DECR_P): New macro.
>> 	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
>> 	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
>> 	* target.def (have_count_reg_decr_p): New hook.
>> 	(doloop_cost_for_generic): Likewise.
>> 	(doloop_cost_for_address): Likewise.
>> 	* doc/tm.texi.in (TARGET_HAVE_COUNT_REG_DECR_P): Likewise.
>> 	(TARGET_DOLOOP_COST_FOR_GENERIC): Likewise.
>> 	(TARGET_DOLOOP_COST_FOR_ADDRESS): Likewise.
>> 	* doc/tm.texi: Regenerate.
>> 	* tree-ssa-loop-ivopts.c (comp_cost::operator+=): Consider infinite cost
>> 	addend.
>> 	(record_group): Init doloop_p.
>> 	(add_candidate_1): Add optional argument doloop, change the handlings
>> 	accordingly.
>> 	(add_candidate): Likewise.
>> 	(generic_predict_doloop_p): Update attribute.
>> 	(force_expr_to_var_cost): Add costing for expressions COND_EXPR/LT_EXPR/
>> 	LE_EXPR/GT_EXPR/GE_EXPR/EQ_EXPR/NE_EXPR/UNORDERED_EXPR/ORDERED_EXPR/
>> 	UNLT_EXPR/UNLE_EXPR/UNGT_EXPR/UNGE_EXPR/UNEQ_EXPR/LTGT_EXPR/MAX_EXPR/
>> 	MIN_EXPR.
>> 	(get_computation_cost): Update for doloop IV cand extra cost.	
>> 	(determine_group_iv_cost_cond): Update for doloop IV cand.
>> 	(determine_iv_cost): Likewise.
>> 	(ivopts_estimate_reg_pressure): Likewise.
>> 	(may_eliminate_iv): Update handlings for doloop IV cand.
>> 	(add_iv_candidate_for_doloop): New function.
>> 	(find_iv_candidates): Call function add_iv_candidate_for_doloop.
>> 	(iv_ca_set_no_cp): Update for doloop IV cand.
>> 	(iv_ca_set_cp): Likewise.
>> 	(iv_ca_dump): Dump register cost.
>> 	(find_doloop_use): New function.
>> 	(analyze_and_mark_doloop_use): Likewise.
>> 	(tree_ssa_iv_optimize_loop): Call function analyze_and_mark_doloop_use.
>>
>> gcc/testsuite/ChangeLog
>>
>> 2019-09-11  Kewen Lin  <linkw@gcc.gnu.org>
>>
>> 	PR middle-end/80791
>> 	* gcc.dg/tree-ssa/ivopts-3.c: Adjust for doloop change.
>> 	* gcc.dg/tree-ssa/ivopts-lt.c: Likewise.
>> 	* gcc.dg/tree-ssa/pr32044.c: Likewise.
>>
>>
>> on 2019/8/23 下午6:18, Segher Boessenkool wrote:
>>> Hi!
>>>
>>> On Fri, Aug 23, 2019 at 05:43:32PM +0800, Bin.Cheng wrote:
>>>> On Fri, Aug 23, 2019 at 4:27 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>>> Not sure if non-ivopts parts are already approved?  If so, the patch
>>>> is okay with above issues addressed.
>>>
>>> The rs6000 part is fine.  The target.def entries need some spell check
>>> and copy-editing, but are obvious and trivial otherwise, and/or you can
>>> approve it as ivopts maintainer.
>>>
>>>> Thanks very much for your time!
>>>
>>> And thank you as well Bin :-)
>>>
>>>
>>> Segher
>>>
>>
> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2019-09-14  9:35 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-14  3:10 [PATCH v2 3/3] Consider doloop cmp use in ivopts linkw
2019-05-14  7:26 ` Richard Biener
2019-05-15  5:03   ` Kewen.Lin
2019-05-15  8:47     ` Richard Biener
2019-05-15 16:17       ` Segher Boessenkool
2019-05-16  7:25         ` Richard Biener
2019-05-16 17:35           ` Segher Boessenkool
2019-05-16  3:53       ` Kewen.Lin
2019-05-16 18:41       ` Jeff Law
2019-05-16 21:42         ` Segher Boessenkool
2019-06-19 11:47 ` [PATCH v3 3/3] PR80791 " Kewen.Lin
2019-06-20  9:09   ` Segher Boessenkool
2019-06-20 12:08     ` Kewen.Lin
2019-06-20 12:17       ` Kewen.Lin
2019-07-10  2:31         ` [PING^1][PATCH v4 " Kewen.Lin
2019-07-12 12:40           ` Richard Biener
2019-07-12 14:10             ` Segher Boessenkool
2019-07-15  6:40             ` Kewen.Lin
2019-07-15  6:50             ` Bin.Cheng
2019-07-21  9:06   ` [PATCH v3 " Bin.Cheng
2019-07-22  5:42     ` Kewen.Lin
2019-07-22  6:53       ` Segher Boessenkool
2019-07-22  7:18         ` Kewen.Lin
2019-07-22  8:02         ` Richard Biener
2019-07-22 21:47           ` Segher Boessenkool
2019-07-23  6:14             ` Kewen.Lin
2019-07-23  7:38             ` Richard Biener
2019-07-23  6:09           ` Kewen.Lin
2019-07-23  8:05             ` Richard Biener
2019-07-23  6:28       ` [PATCH v5 " Kewen.Lin
2019-08-14  7:48         ` [PATCH v6 " Kewen.Lin
2019-08-21 13:42           ` Bin.Cheng
2019-08-22  7:09             ` Kewen.Lin
2019-08-22  8:07               ` Bin.Cheng
2019-08-22  9:16                 ` Kewen.Lin
2019-08-23  5:31                   ` Bin.Cheng
2019-08-23  9:57                     ` Kewen.Lin
2019-08-23 10:43                       ` Bin.Cheng
2019-08-23 11:02                         ` Segher Boessenkool
2019-09-11  6:18                           ` Kewen.Lin
2019-09-12  8:14                             ` Richard Biener
2019-09-14  9:35                               ` Kewen.Lin
2019-08-24 22:43                         ` Kewen.Lin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).