public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
@ 2019-10-17  8:35 Feng Xue OS
  2019-10-18  2:12 ` luoxhu
  2019-10-24  6:17 ` luoxhu
  0 siblings, 2 replies; 19+ messages in thread
From: Feng Xue OS @ 2019-10-17  8:35 UTC (permalink / raw)
  To: gcc-patches, Jan Hubicka, Martin Jambor

IPA does not allow constant propagation on parameter that is used to control
function recursion. 

recur_fn (i)
{
  if ( !terminate_recursion (i))
    {
      ...  
      recur_fn (i + 1);
      ...
    }
  ...
}

This patch is composed to enable multi-versioning for self-recursive function,
and versioning copies is limited by a specified option.

Feng
---
diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 045072e02ec..6255a766e4d 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -229,7 +229,9 @@ public:
   inline bool set_contains_variable ();
   bool add_value (valtype newval, cgraph_edge *cs,
 		  ipcp_value<valtype> *src_val = NULL,
-		  int src_idx = 0, HOST_WIDE_INT offset = -1);
+		  int src_idx = 0, HOST_WIDE_INT offset = -1,
+		  ipcp_value<valtype> **val_pos_p = NULL,
+		  bool unlimited = false);
   void print (FILE * f, bool dump_sources, bool dump_benefits);
 };
 
@@ -1579,22 +1581,37 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
 /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
    SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
    meaning.  OFFSET -1 means the source is scalar and not a part of an
-   aggregate.  */
+   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
+   after which newly created ipcp_value will be inserted, and it is also
+   used to record address of the added ipcp_value before function returns.
+   UNLIMITED means whether value count should not exceed the limit given
+   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
 
 template <typename valtype>
 bool
 ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 				  ipcp_value<valtype> *src_val,
-				  int src_idx, HOST_WIDE_INT offset)
+				  int src_idx, HOST_WIDE_INT offset,
+				  ipcp_value<valtype> **val_pos_p,
+				  bool unlimited)
 {
   ipcp_value<valtype> *val;
 
+  if (val_pos_p)
+    {
+      for (val = values; val && val != *val_pos_p; val = val->next);
+      gcc_checking_assert (val);
+    }
+
   if (bottom)
     return false;
 
   for (val = values; val; val = val->next)
     if (values_equal_for_ipcp_p (val->value, newval))
       {
+	if (val_pos_p)
+	  *val_pos_p = val;
+
 	if (ipa_edge_within_scc (cs))
 	  {
 	    ipcp_value_source<valtype> *s;
@@ -1609,7 +1626,8 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	return false;
       }
 
-  if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
+  if (!unlimited
+      && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
     {
       /* We can only free sources, not the values themselves, because sources
 	 of other values in this SCC might point to them.   */
@@ -1623,6 +1641,9 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	    }
 	}
 
+      if (val_pos_p)
+	*val_pos_p = NULL;
+
       values = NULL;
       return set_to_bottom ();
     }
@@ -1630,8 +1651,54 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
   values_count++;
   val = allocate_and_init_ipcp_value (newval);
   val->add_source (cs, src_val, src_idx, offset);
-  val->next = values;
-  values = val;
+  if (val_pos_p)
+    {
+      val->next = (*val_pos_p)->next;
+      (*val_pos_p)->next = val;
+      *val_pos_p = val;
+    }
+  else
+    {
+      val->next = values;
+      values = val;
+    }
+
+  return true;
+}
+
+/* Return true, if a ipcp_value VAL is orginated from parameter value of
+   self-feeding recursive function by applying non-passthrough arithmetic
+   transformation.  */
+
+static bool
+self_recursively_generated_p (ipcp_value<tree> *val)
+{
+  class ipa_node_params *info = NULL;
+
+  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
+    {
+      cgraph_edge *cs = src->cs;
+
+      if (!src->val || cs->caller != cs->callee->function_symbol ()
+	  || src->val == val)
+	return false;
+
+      if (!info)
+	info = IPA_NODE_REF (cs->caller);
+
+      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
+								src->index);
+      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
+						      : plats->aggs;
+      ipcp_value<tree> *src_val;
+
+      for (src_val = src_lat->values; src_val && src_val != val;
+	   src_val = src_val->next);
+
+      if (!src_val)
+	return false;
+    }
+
   return true;
 }
 
@@ -1649,20 +1716,72 @@ propagate_vals_across_pass_through (cgraph_edge *cs, ipa_jump_func *jfunc,
   ipcp_value<tree> *src_val;
   bool ret = false;
 
-  /* Do not create new values when propagating within an SCC because if there
-     are arithmetic functions with circular dependencies, there is infinite
-     number of them and we would just make lattices bottom.  If this condition
-     is ever relaxed we have to detect self-feeding recursive calls in
-     cgraph_edge_brings_value_p in a smarter way.  */
+  /* Due to circular dependencies, propagating within an SCC through arithmetic
+     transformation would create infinite number of values.  But for
+     self-feeding recursive function, we could allow propagation in a limited
+     count, and this can enable a simple kind of recursive function versioning.
+     For other scenario, we would just make lattices bottom.  */
   if ((ipa_get_jf_pass_through_operation (jfunc) != NOP_EXPR)
       && ipa_edge_within_scc (cs))
-    ret = dest_lat->set_contains_variable ();
+    {
+      if (src_lat != dest_lat)
+	return dest_lat->set_contains_variable ();
+
+      auto_vec<ipcp_value<tree> *, 8> val_seeds;
+
+      for (src_val = src_lat->values; src_val; src_val = src_val->next)
+	{
+	  /* Now we do not use self-recursively generated value as propagation
+	     source, this is absolutely conservative, but could avoid explosion
+	     of lattice's value space, especially when one recursive function
+	     calls another recursive.  */
+	  if (self_recursively_generated_p (src_val))
+	    {
+	      /* If the lattice has already been propagated for the call site,
+		 no need to do that again.  */
+	      for (ipcp_value_source<tree> *s = src_val->sources; s;
+		   s = s->next)
+		if (s->cs == cs)
+		  return dest_lat->set_contains_variable ();
+	    }
+	  else
+	    val_seeds.safe_push (src_val);
+	}
+
+      int clone_copies = PARAM_VALUE (PARAM_IPA_CP_MAX_RECURSION_COPIES);
+      int i;
+
+      /* Recursively generate lattice values with a limited count.  */
+      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
+	{
+	  for (int j = 1; j < clone_copies; j++)
+	    {
+	      tree cstval = ipa_get_jf_pass_through_result (jfunc,
+	      						    src_val->value,
+							    parm_type);
+	      if (!cstval)
+		break;
+
+	      /* Try to place the new lattice value after its source, which
+		 can decrease iterations of propagate stage.  */
+	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
+					  -1, &src_val, true);
+	      gcc_checking_assert (src_val);
+	    }
+	}
+      ret |= dest_lat->set_contains_variable ();
+    }
   else
     for (src_val = src_lat->values; src_val; src_val = src_val->next)
       {
+	  /* Now we do not use self-recursively generated value as propagation
+	     source, otherwise it is easy to make value space of normal lattice
+	     overflow.  */
+	if (self_recursively_generated_p (src_val))
+	  continue;
+
 	tree cstval = ipa_get_jf_pass_through_result (jfunc, src_val->value,
 						      parm_type);
-
 	if (cstval)
 	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx);
 	else
@@ -3635,6 +3754,9 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
 	      hot |= cs->maybe_hot_p ();
 	      if (cs->caller != dest)
 		non_self_recursive = true;
+	      else if (src->val)
+		gcc_assert (values_equal_for_ipcp_p (src->val->value,
+						     val->value));
 	    }
 	  cs = get_next_cgraph_edge_clone (cs);
 	}
diff --git a/gcc/params.def b/gcc/params.def
index 322c37f8b96..3e242e09f01 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1135,6 +1135,11 @@ DEFPARAM (PARAM_IPA_CP_RECURSION_PENALTY,
 	  "are evaluated for cloning.",
 	  40, 0, 100)
 
+DEFPARAM (PARAM_IPA_CP_MAX_RECURSION_COPIES,
+	  "ipa-cp-max-recursion-copies",
+	  "Maximum function versioning copies for a self-recursive function.",
+	  8, 0, 0)
+
 DEFPARAM (PARAM_IPA_CP_SINGLE_CALL_PENALTY,
 	  "ipa-cp-single-call-penalty",
 	  "Percentage penalty functions containing a single call to another "
diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
new file mode 100644
index 00000000000..7c1c01047c6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining" } */
+
+int fn();
+
+int data[100];
+
+int recur_fn (int i)
+{
+  int j;
+   
+  if (i == 6)
+    {
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      return 10;
+    }
+
+  data[i] = i; 
+
+  for (j = 0; j < 100; j++)
+    recur_fn (i + 1);
+
+  return i; 
+}
+
+int main ()
+{
+  int i;
+
+  for (i = 0; i < 100; i++)
+    recur_fn (1) + recur_fn (-5);
+
+  return 1;
+}
+
+/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */
-- 
2.17.1

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-10-17  8:35 [PATCH] Support multi-versioning on self-recursive function (ipa/92133) Feng Xue OS
@ 2019-10-18  2:12 ` luoxhu
  2019-10-18  5:15   ` Feng Xue OS
  2019-10-24  6:17 ` luoxhu
  1 sibling, 1 reply; 19+ messages in thread
From: luoxhu @ 2019-10-18  2:12 UTC (permalink / raw)
  To: Feng Xue OS, gcc-patches, Jan Hubicka, Martin Jambor

Hi Feng,

On 2019/10/17 16:23, Feng Xue OS wrote:
> IPA does not allow constant propagation on parameter that is used to control
> function recursion.
> 
> recur_fn (i)
> {
>    if ( !terminate_recursion (i))
>      {
>        ...
>        recur_fn (i + 1);
>        ...
>      }
>    ...
> }
> 
> This patch is composed to enable multi-versioning for self-recursive function,
> and versioning copies is limited by a specified option.

I noticed similar issue when analyzing the SPEC, self-recursive function is
not versioned and posted my observations in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92074.

Generally, this could be implemented well by your patch, while I am
wondering whether it is OK to convert the recursive function to
non-recursive function in a independent pass after ipa-cp and ipa-sra instead
of reuse the ipa-cp framework?
The reason is sometimes the argument is passed-by-reference, and
ipa-sra runs after ipa-cp, so this kind of optimization may not be done in
WPA.  What's your idea about this, please?   Thanks. 


Thanks
Xiong Hu

> 
> Feng
> ---
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 045072e02ec..6255a766e4d 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -229,7 +229,9 @@ public:
>     inline bool set_contains_variable ();
>     bool add_value (valtype newval, cgraph_edge *cs,
>   		  ipcp_value<valtype> *src_val = NULL,
> -		  int src_idx = 0, HOST_WIDE_INT offset = -1);
> +		  int src_idx = 0, HOST_WIDE_INT offset = -1,
> +		  ipcp_value<valtype> **val_pos_p = NULL,
> +		  bool unlimited = false);
>     void print (FILE * f, bool dump_sources, bool dump_benefits);
>   };
>   
> @@ -1579,22 +1581,37 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
>   /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
>      SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
>      meaning.  OFFSET -1 means the source is scalar and not a part of an
> -   aggregate.  */
> +   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
> +   after which newly created ipcp_value will be inserted, and it is also
> +   used to record address of the added ipcp_value before function returns.
> +   UNLIMITED means whether value count should not exceed the limit given
> +   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
>   
>   template <typename valtype>
>   bool
>   ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   				  ipcp_value<valtype> *src_val,
> -				  int src_idx, HOST_WIDE_INT offset)
> +				  int src_idx, HOST_WIDE_INT offset,
> +				  ipcp_value<valtype> **val_pos_p,
> +				  bool unlimited)
>   {
>     ipcp_value<valtype> *val;
>   
> +  if (val_pos_p)
> +    {
> +      for (val = values; val && val != *val_pos_p; val = val->next);
> +      gcc_checking_assert (val);
> +    }
> +
>     if (bottom)
>       return false;
>   
>     for (val = values; val; val = val->next)
>       if (values_equal_for_ipcp_p (val->value, newval))
>         {
> +	if (val_pos_p)
> +	  *val_pos_p = val;
> +
>   	if (ipa_edge_within_scc (cs))
>   	  {
>   	    ipcp_value_source<valtype> *s;
> @@ -1609,7 +1626,8 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   	return false;
>         }
>   
> -  if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
> +  if (!unlimited
> +      && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
>       {
>         /* We can only free sources, not the values themselves, because sources
>   	 of other values in this SCC might point to them.   */
> @@ -1623,6 +1641,9 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   	    }
>   	}
>   
> +      if (val_pos_p)
> +	*val_pos_p = NULL;
> +
>         values = NULL;
>         return set_to_bottom ();
>       }
> @@ -1630,8 +1651,54 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>     values_count++;
>     val = allocate_and_init_ipcp_value (newval);
>     val->add_source (cs, src_val, src_idx, offset);
> -  val->next = values;
> -  values = val;
> +  if (val_pos_p)
> +    {
> +      val->next = (*val_pos_p)->next;
> +      (*val_pos_p)->next = val;
> +      *val_pos_p = val;
> +    }
> +  else
> +    {
> +      val->next = values;
> +      values = val;
> +    }
> +
> +  return true;
> +}
> +
> +/* Return true, if a ipcp_value VAL is orginated from parameter value of
> +   self-feeding recursive function by applying non-passthrough arithmetic
> +   transformation.  */
> +
> +static bool
> +self_recursively_generated_p (ipcp_value<tree> *val)
> +{
> +  class ipa_node_params *info = NULL;
> +
> +  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
> +    {
> +      cgraph_edge *cs = src->cs;
> +
> +      if (!src->val || cs->caller != cs->callee->function_symbol ()
> +	  || src->val == val)
> +	return false;
> +
> +      if (!info)
> +	info = IPA_NODE_REF (cs->caller);
> +
> +      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
> +								src->index);
> +      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
> +						      : plats->aggs;
> +      ipcp_value<tree> *src_val;
> +
> +      for (src_val = src_lat->values; src_val && src_val != val;
> +	   src_val = src_val->next);
> +
> +      if (!src_val)
> +	return false;
> +    }
> +
>     return true;
>   }
>   
> @@ -1649,20 +1716,72 @@ propagate_vals_across_pass_through (cgraph_edge *cs, ipa_jump_func *jfunc,
>     ipcp_value<tree> *src_val;
>     bool ret = false;
>   
> -  /* Do not create new values when propagating within an SCC because if there
> -     are arithmetic functions with circular dependencies, there is infinite
> -     number of them and we would just make lattices bottom.  If this condition
> -     is ever relaxed we have to detect self-feeding recursive calls in
> -     cgraph_edge_brings_value_p in a smarter way.  */
> +  /* Due to circular dependencies, propagating within an SCC through arithmetic
> +     transformation would create infinite number of values.  But for
> +     self-feeding recursive function, we could allow propagation in a limited
> +     count, and this can enable a simple kind of recursive function versioning.
> +     For other scenario, we would just make lattices bottom.  */
>     if ((ipa_get_jf_pass_through_operation (jfunc) != NOP_EXPR)
>         && ipa_edge_within_scc (cs))
> -    ret = dest_lat->set_contains_variable ();
> +    {
> +      if (src_lat != dest_lat)
> +	return dest_lat->set_contains_variable ();
> +
> +      auto_vec<ipcp_value<tree> *, 8> val_seeds;
> +
> +      for (src_val = src_lat->values; src_val; src_val = src_val->next)
> +	{
> +	  /* Now we do not use self-recursively generated value as propagation
> +	     source, this is absolutely conservative, but could avoid explosion
> +	     of lattice's value space, especially when one recursive function
> +	     calls another recursive.  */
> +	  if (self_recursively_generated_p (src_val))
> +	    {
> +	      /* If the lattice has already been propagated for the call site,
> +		 no need to do that again.  */
> +	      for (ipcp_value_source<tree> *s = src_val->sources; s;
> +		   s = s->next)
> +		if (s->cs == cs)
> +		  return dest_lat->set_contains_variable ();
> +	    }
> +	  else
> +	    val_seeds.safe_push (src_val);
> +	}
> +
> +      int clone_copies = PARAM_VALUE (PARAM_IPA_CP_MAX_RECURSION_COPIES);
> +      int i;
> +
> +      /* Recursively generate lattice values with a limited count.  */
> +      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
> +	{
> +	  for (int j = 1; j < clone_copies; j++)
> +	    {
> +	      tree cstval = ipa_get_jf_pass_through_result (jfunc,
> +	      						    src_val->value,
> +							    parm_type);
> +	      if (!cstval)
> +		break;
> +
> +	      /* Try to place the new lattice value after its source, which
> +		 can decrease iterations of propagate stage.  */
> +	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
> +					  -1, &src_val, true);
> +	      gcc_checking_assert (src_val);
> +	    }
> +	}
> +      ret |= dest_lat->set_contains_variable ();
> +    }
>     else
>       for (src_val = src_lat->values; src_val; src_val = src_val->next)
>         {
> +	  /* Now we do not use self-recursively generated value as propagation
> +	     source, otherwise it is easy to make value space of normal lattice
> +	     overflow.  */
> +	if (self_recursively_generated_p (src_val))
> +	  continue;
> +
>   	tree cstval = ipa_get_jf_pass_through_result (jfunc, src_val->value,
>   						      parm_type);
> -
>   	if (cstval)
>   	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx);
>   	else
> @@ -3635,6 +3754,9 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
>   	      hot |= cs->maybe_hot_p ();
>   	      if (cs->caller != dest)
>   		non_self_recursive = true;
> +	      else if (src->val)
> +		gcc_assert (values_equal_for_ipcp_p (src->val->value,
> +						     val->value));
>   	    }
>   	  cs = get_next_cgraph_edge_clone (cs);
>   	}
> diff --git a/gcc/params.def b/gcc/params.def
> index 322c37f8b96..3e242e09f01 100644
> --- a/gcc/params.def
> +++ b/gcc/params.def
> @@ -1135,6 +1135,11 @@ DEFPARAM (PARAM_IPA_CP_RECURSION_PENALTY,
>   	  "are evaluated for cloning.",
>   	  40, 0, 100)
>   
> +DEFPARAM (PARAM_IPA_CP_MAX_RECURSION_COPIES,
> +	  "ipa-cp-max-recursion-copies",
> +	  "Maximum function versioning copies for a self-recursive function.",
> +	  8, 0, 0)
> +
>   DEFPARAM (PARAM_IPA_CP_SINGLE_CALL_PENALTY,
>   	  "ipa-cp-single-call-penalty",
>   	  "Percentage penalty functions containing a single call to another "
> diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
> new file mode 100644
> index 00000000000..7c1c01047c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
> @@ -0,0 +1,47 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining" } */
> +
> +int fn();
> +
> +int data[100];
> +
> +int recur_fn (int i)
> +{
> +  int j;
> +
> +  if (i == 6)
> +    {
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      return 10;
> +    }
> +
> +  data[i] = i;
> +
> +  for (j = 0; j < 100; j++)
> +    recur_fn (i + 1);
> +
> +  return i;
> +}
> +
> +int main ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 100; i++)
> +    recur_fn (1) + recur_fn (-5);
> +
> +  return 1;
> +}
> +
> +/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-10-18  2:12 ` luoxhu
@ 2019-10-18  5:15   ` Feng Xue OS
  0 siblings, 0 replies; 19+ messages in thread
From: Feng Xue OS @ 2019-10-18  5:15 UTC (permalink / raw)
  To: luoxhu, gcc-patches, Jan Hubicka, Martin Jambor

> I noticed similar issue when analyzing the SPEC, self-recursive function is
> not versioned and posted my observations in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92074.

> Generally, this could be implemented well by your patch, while I am
> wondering whether it is OK to convert the recursive function to
> non-recursive function in a independent pass after ipa-cp and ipa-sra instead
> of reuse the ipa-cp framework?
> The reason is sometimes the argument is passed-by-reference, and
> ipa-sra runs after ipa-cp, so this kind of optimization may not be done in
> WPA.  What's your idea about this, please?   Thanks.

Function versioning is done in ipa-cp, there is nothing special for recursive function.
So adding a dedicated pass for recursive seems to be redundant.

We might not need to resort to ipa-sra to resolve concern you mentioned. Original
ipa-cp already supports a simple kind of propagation on by-ref argument, who must
be defined by constant. And for an extended form as:  *arg = *param OP constant,  I've
created a tracker PR91682,  also composed a patch:
https://gcc.gnu.org/ml/gcc-patches/2019-09/msg01189.html.

Feng

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-10-17  8:35 [PATCH] Support multi-versioning on self-recursive function (ipa/92133) Feng Xue OS
  2019-10-18  2:12 ` luoxhu
@ 2019-10-24  6:17 ` luoxhu
  2019-10-24  6:57   ` Feng Xue OS
  1 sibling, 1 reply; 19+ messages in thread
From: luoxhu @ 2019-10-24  6:17 UTC (permalink / raw)
  To: Feng Xue OS, gcc-patches, Jan Hubicka, Martin Jambor

Hi,


On 2019/10/17 16:23, Feng Xue OS wrote:
> IPA does not allow constant propagation on parameter that is used to control
> function recursion.
> 
> recur_fn (i)
> {
>    if ( !terminate_recursion (i))
>      {
>        ...
>        recur_fn (i + 1);
>        ...
>      }
>    ...
> }
> 
> This patch is composed to enable multi-versioning for self-recursive function,
> and versioning copies is limited by a specified option.
> 
> Feng
> ---
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 045072e02ec..6255a766e4d 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -229,7 +229,9 @@ public:
>     inline bool set_contains_variable ();
>     bool add_value (valtype newval, cgraph_edge *cs,
>   		  ipcp_value<valtype> *src_val = NULL,
> -		  int src_idx = 0, HOST_WIDE_INT offset = -1);
> +		  int src_idx = 0, HOST_WIDE_INT offset = -1,
> +		  ipcp_value<valtype> **val_pos_p = NULL,
> +		  bool unlimited = false);
>     void print (FILE * f, bool dump_sources, bool dump_benefits);
>   };
>   
> @@ -1579,22 +1581,37 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
>   /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
>      SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
>      meaning.  OFFSET -1 means the source is scalar and not a part of an
> -   aggregate.  */
> +   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
> +   after which newly created ipcp_value will be inserted, and it is also
> +   used to record address of the added ipcp_value before function returns.
> +   UNLIMITED means whether value count should not exceed the limit given
> +   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
>   
>   template <typename valtype>
>   bool
>   ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   				  ipcp_value<valtype> *src_val,
> -				  int src_idx, HOST_WIDE_INT offset)
> +				  int src_idx, HOST_WIDE_INT offset,
> +				  ipcp_value<valtype> **val_pos_p,
> +				  bool unlimited)
>   {
>     ipcp_value<valtype> *val;
>   
> +  if (val_pos_p)
> +    {
> +      for (val = values; val && val != *val_pos_p; val = val->next);
> +      gcc_checking_assert (val);
> +    }
> +
>     if (bottom)
>       return false;
>   
>     for (val = values; val; val = val->next)
>       if (values_equal_for_ipcp_p (val->value, newval))
>         {
> +	if (val_pos_p)
> +	  *val_pos_p = val;
> +
>   	if (ipa_edge_within_scc (cs))
>   	  {
>   	    ipcp_value_source<valtype> *s;
> @@ -1609,7 +1626,8 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   	return false;
>         }
>   
> -  if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
> +  if (!unlimited
> +      && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
>       {
>         /* We can only free sources, not the values themselves, because sources
>   	 of other values in this SCC might point to them.   */
> @@ -1623,6 +1641,9 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>   	    }
>   	}
>   
> +      if (val_pos_p)
> +	*val_pos_p = NULL;
> +
>         values = NULL;
>         return set_to_bottom ();
>       }
> @@ -1630,8 +1651,54 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>     values_count++;
>     val = allocate_and_init_ipcp_value (newval);
>     val->add_source (cs, src_val, src_idx, offset);
> -  val->next = values;
> -  values = val;
> +  if (val_pos_p)
> +    {
> +      val->next = (*val_pos_p)->next;
> +      (*val_pos_p)->next = val;
> +      *val_pos_p = val;
> +    }
> +  else
> +    {
> +      val->next = values;
> +      values = val;
> +    }
> +
> +  return true;
> +}
> +
> +/* Return true, if a ipcp_value VAL is orginated from parameter value of
> +   self-feeding recursive function by applying non-passthrough arithmetic
> +   transformation.  */
> +
> +static bool
> +self_recursively_generated_p (ipcp_value<tree> *val)
> +{
> +  class ipa_node_params *info = NULL;
> +
> +  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
> +    {
> +      cgraph_edge *cs = src->cs;
> +
> +      if (!src->val || cs->caller != cs->callee->function_symbol ()
> +	  || src->val == val)
> +	return false;
> +
> +      if (!info)
> +	info = IPA_NODE_REF (cs->caller);
> +
> +      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
> +								src->index);
> +      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
> +						      : plats->aggs;

Thanks for the patch.
This function doesn't handle the by-ref case after rebasing this patch to your
previous ipa-cp by-ref of arith patch as below (Also some conflicts need fix):

foo (int * p) { ...  return foo(*(&p) + 1); }

It will cause value explosion.

Secondly, the self_recursive doesn't check or break if the value is above than
the recursive boundary, which seems would generate redundant nodes?


IMHO, It may be helpful if adding more comments before the return and dump log
for dump-ipa-cp-all-details when adding new values and new sources like below
for debugging and others can understand the code easier? :)

newval, src_val, src_val->sources,  *src_val->sources,  caller,  callee
1, (ipcp_value<tree_node*> *) 0x12bc1828, (ipcp_value_source<tree_node*> *) 0x12bae800,  {offset = -1, cs = 0x3fffb5602768, val = 0x0, next = 0x0, index = 0}, "main", recur_fn"
2, (ipcp_value<tree_node*> *) 0x12bc1880, (ipcp_value_source<tree_node*> *) 0x12bae830,  {offset = -1, cs = 0x3fffb5602630, val = 0x12bc1828, next = 0x0, index = 0},"recur_fn","recur_fn"
...
1, (ipcp_value<tree_node*> *) 0x12bc1828, (ipcp_value_source<tree_node*> *) 0x12baea70, {offset = -1, cs = 0x3fffb5602630, val = 0x12bc1c48, next = 0x12bae800, index = 0}  "recur_fn","recur_fn"
2, (ipcp_value<tree_node*> *) 0x12bc1880, (ipcp_value_source<tree_node*> *) 0x12bae830, {offset = -1, cs = 0x3fffb5602630, val = 0x12bc1828, next = 0x0, index = 0}  "recur_fn","recur_fn"


Xiong Hu
Thanks

> +      ipcp_value<tree> *src_val;
> +
> +      for (src_val = src_lat->values; src_val && src_val != val;
> +	   src_val = src_val->next);
> +
> +      if (!src_val)
> +	return false;
> +    }
> +
>     return true;
>   }
>   
> @@ -1649,20 +1716,72 @@ propagate_vals_across_pass_through (cgraph_edge *cs, ipa_jump_func *jfunc,
>     ipcp_value<tree> *src_val;
>     bool ret = false;
>   
> -  /* Do not create new values when propagating within an SCC because if there
> -     are arithmetic functions with circular dependencies, there is infinite
> -     number of them and we would just make lattices bottom.  If this condition
> -     is ever relaxed we have to detect self-feeding recursive calls in
> -     cgraph_edge_brings_value_p in a smarter way.  */
> +  /* Due to circular dependencies, propagating within an SCC through arithmetic
> +     transformation would create infinite number of values.  But for
> +     self-feeding recursive function, we could allow propagation in a limited
> +     count, and this can enable a simple kind of recursive function versioning.
> +     For other scenario, we would just make lattices bottom.  */
>     if ((ipa_get_jf_pass_through_operation (jfunc) != NOP_EXPR)
>         && ipa_edge_within_scc (cs))
> -    ret = dest_lat->set_contains_variable ();
> +    {
> +      if (src_lat != dest_lat)
> +	return dest_lat->set_contains_variable ();
> +
> +      auto_vec<ipcp_value<tree> *, 8> val_seeds;
> +
> +      for (src_val = src_lat->values; src_val; src_val = src_val->next)
> +	{
> +	  /* Now we do not use self-recursively generated value as propagation
> +	     source, this is absolutely conservative, but could avoid explosion
> +	     of lattice's value space, especially when one recursive function
> +	     calls another recursive.  */
> +	  if (self_recursively_generated_p (src_val))
> +	    {
> +	      /* If the lattice has already been propagated for the call site,
> +		 no need to do that again.  */
> +	      for (ipcp_value_source<tree> *s = src_val->sources; s;
> +		   s = s->next)
> +		if (s->cs == cs)
> +		  return dest_lat->set_contains_variable ();
> +	    }
> +	  else
> +	    val_seeds.safe_push (src_val);
> +	}
> +
> +      int clone_copies = PARAM_VALUE (PARAM_IPA_CP_MAX_RECURSION_COPIES);
> +      int i;
> +
> +      /* Recursively generate lattice values with a limited count.  */
> +      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
> +	{
> +	  for (int j = 1; j < clone_copies; j++)
> +	    {
> +	      tree cstval = ipa_get_jf_pass_through_result (jfunc,
> +	      						    src_val->value,
> +							    parm_type);
> +	      if (!cstval)
> +		break;
> +
> +	      /* Try to place the new lattice value after its source, which
> +		 can decrease iterations of propagate stage.  */
> +	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
> +					  -1, &src_val, true);
> +	      gcc_checking_assert (src_val);
> +	    }
> +	}
> +      ret |= dest_lat->set_contains_variable ();
> +    }
>     else
>       for (src_val = src_lat->values; src_val; src_val = src_val->next)
>         {
> +	  /* Now we do not use self-recursively generated value as propagation
> +	     source, otherwise it is easy to make value space of normal lattice
> +	     overflow.  */
> +	if (self_recursively_generated_p (src_val))
> +	  continue;
> +
>   	tree cstval = ipa_get_jf_pass_through_result (jfunc, src_val->value,
>   						      parm_type);
> -
>   	if (cstval)
>   	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx);
>   	else
> @@ -3635,6 +3754,9 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
>   	      hot |= cs->maybe_hot_p ();
>   	      if (cs->caller != dest)
>   		non_self_recursive = true;
> +	      else if (src->val)
> +		gcc_assert (values_equal_for_ipcp_p (src->val->value,
> +						     val->value));
>   	    }
>   	  cs = get_next_cgraph_edge_clone (cs);
>   	}
> diff --git a/gcc/params.def b/gcc/params.def
> index 322c37f8b96..3e242e09f01 100644
> --- a/gcc/params.def
> +++ b/gcc/params.def
> @@ -1135,6 +1135,11 @@ DEFPARAM (PARAM_IPA_CP_RECURSION_PENALTY,
>   	  "are evaluated for cloning.",
>   	  40, 0, 100)
>   
> +DEFPARAM (PARAM_IPA_CP_MAX_RECURSION_COPIES,
> +	  "ipa-cp-max-recursion-copies",
> +	  "Maximum function versioning copies for a self-recursive function.",
> +	  8, 0, 0)
> +
>   DEFPARAM (PARAM_IPA_CP_SINGLE_CALL_PENALTY,
>   	  "ipa-cp-single-call-penalty",
>   	  "Percentage penalty functions containing a single call to another "
> diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
> new file mode 100644
> index 00000000000..7c1c01047c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
> @@ -0,0 +1,47 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining" } */
> +
> +int fn();
> +
> +int data[100];
> +
> +int recur_fn (int i)
> +{
> +  int j;
> +
> +  if (i == 6)
> +    {
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      return 10;
> +    }
> +
> +  data[i] = i;
> +
> +  for (j = 0; j < 100; j++)
> +    recur_fn (i + 1);
> +
> +  return i;
> +}
> +
> +int main ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 100; i++)
> +    recur_fn (1) + recur_fn (-5);
> +
> +  return 1;
> +}
> +
> +/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-10-24  6:17 ` luoxhu
@ 2019-10-24  6:57   ` Feng Xue OS
  2019-11-14 13:29     ` Jan Hubicka
  0 siblings, 1 reply; 19+ messages in thread
From: Feng Xue OS @ 2019-10-24  6:57 UTC (permalink / raw)
  To: luoxhu, gcc-patches, Jan Hubicka, Martin Jambor

[-- Attachment #1: Type: text/plain, Size: 13394 bytes --]

Hi,

  Actually, this patch is not final one. Since the previous cp-by-ref patch is still on the way to the trunk,
I tailored it to only handle by-value recursion, so that it is decoupled with the previous patch, and can
be reviewed independently. I attached the final patch, you can have a try.

 CP propagation stage might generate values outside of recursion ranges, but cloning stage will not
make a duplicate for invalid recursion value, with which we do not need extra code for recursion range
analysis.

Feng

________________________________________
From: luoxhu <luoxhu@linux.ibm.com>
Sent: Thursday, October 24, 2019 1:44 PM
To: Feng Xue OS; gcc-patches@gcc.gnu.org; Jan Hubicka; Martin Jambor
Subject: Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)

Hi,


On 2019/10/17 16:23, Feng Xue OS wrote:
> IPA does not allow constant propagation on parameter that is used to control
> function recursion.
>
> recur_fn (i)
> {
>    if ( !terminate_recursion (i))
>      {
>        ...
>        recur_fn (i + 1);
>        ...
>      }
>    ...
> }
>
> This patch is composed to enable multi-versioning for self-recursive function,
> and versioning copies is limited by a specified option.
>
> Feng
> ---
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 045072e02ec..6255a766e4d 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -229,7 +229,9 @@ public:
>     inline bool set_contains_variable ();
>     bool add_value (valtype newval, cgraph_edge *cs,
>                 ipcp_value<valtype> *src_val = NULL,
> -               int src_idx = 0, HOST_WIDE_INT offset = -1);
> +               int src_idx = 0, HOST_WIDE_INT offset = -1,
> +               ipcp_value<valtype> **val_pos_p = NULL,
> +               bool unlimited = false);
>     void print (FILE * f, bool dump_sources, bool dump_benefits);
>   };
>
> @@ -1579,22 +1581,37 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
>   /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
>      SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
>      meaning.  OFFSET -1 means the source is scalar and not a part of an
> -   aggregate.  */
> +   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
> +   after which newly created ipcp_value will be inserted, and it is also
> +   used to record address of the added ipcp_value before function returns.
> +   UNLIMITED means whether value count should not exceed the limit given
> +   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
>
>   template <typename valtype>
>   bool
>   ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>                                 ipcp_value<valtype> *src_val,
> -                               int src_idx, HOST_WIDE_INT offset)
> +                               int src_idx, HOST_WIDE_INT offset,
> +                               ipcp_value<valtype> **val_pos_p,
> +                               bool unlimited)
>   {
>     ipcp_value<valtype> *val;
>
> +  if (val_pos_p)
> +    {
> +      for (val = values; val && val != *val_pos_p; val = val->next);
> +      gcc_checking_assert (val);
> +    }
> +
>     if (bottom)
>       return false;
>
>     for (val = values; val; val = val->next)
>       if (values_equal_for_ipcp_p (val->value, newval))
>         {
> +     if (val_pos_p)
> +       *val_pos_p = val;
> +
>       if (ipa_edge_within_scc (cs))
>         {
>           ipcp_value_source<valtype> *s;
> @@ -1609,7 +1626,8 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>       return false;
>         }
>
> -  if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
> +  if (!unlimited
> +      && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
>       {
>         /* We can only free sources, not the values themselves, because sources
>        of other values in this SCC might point to them.   */
> @@ -1623,6 +1641,9 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>           }
>       }
>
> +      if (val_pos_p)
> +     *val_pos_p = NULL;
> +
>         values = NULL;
>         return set_to_bottom ();
>       }
> @@ -1630,8 +1651,54 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>     values_count++;
>     val = allocate_and_init_ipcp_value (newval);
>     val->add_source (cs, src_val, src_idx, offset);
> -  val->next = values;
> -  values = val;
> +  if (val_pos_p)
> +    {
> +      val->next = (*val_pos_p)->next;
> +      (*val_pos_p)->next = val;
> +      *val_pos_p = val;
> +    }
> +  else
> +    {
> +      val->next = values;
> +      values = val;
> +    }
> +
> +  return true;
> +}
> +
> +/* Return true, if a ipcp_value VAL is orginated from parameter value of
> +   self-feeding recursive function by applying non-passthrough arithmetic
> +   transformation.  */
> +
> +static bool
> +self_recursively_generated_p (ipcp_value<tree> *val)
> +{
> +  class ipa_node_params *info = NULL;
> +
> +  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
> +    {
> +      cgraph_edge *cs = src->cs;
> +
> +      if (!src->val || cs->caller != cs->callee->function_symbol ()
> +       || src->val == val)
> +     return false;
> +
> +      if (!info)
> +     info = IPA_NODE_REF (cs->caller);
> +
> +      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
> +                                                             src->index);
> +      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
> +                                                   : plats->aggs;

Thanks for the patch.
This function doesn't handle the by-ref case after rebasing this patch to your
previous ipa-cp by-ref of arith patch as below (Also some conflicts need fix):

foo (int * p) { ...  return foo(*(&p) + 1); }

It will cause value explosion.

Secondly, the self_recursive doesn't check or break if the value is above than
the recursive boundary, which seems would generate redundant nodes?


IMHO, It may be helpful if adding more comments before the return and dump log
for dump-ipa-cp-all-details when adding new values and new sources like below
for debugging and others can understand the code easier? :)

newval, src_val, src_val->sources,  *src_val->sources,  caller,  callee
1, (ipcp_value<tree_node*> *) 0x12bc1828, (ipcp_value_source<tree_node*> *) 0x12bae800,  {offset = -1, cs = 0x3fffb5602768, val = 0x0, next = 0x0, index = 0}, "main", recur_fn"
2, (ipcp_value<tree_node*> *) 0x12bc1880, (ipcp_value_source<tree_node*> *) 0x12bae830,  {offset = -1, cs = 0x3fffb5602630, val = 0x12bc1828, next = 0x0, index = 0},"recur_fn","recur_fn"
...
1, (ipcp_value<tree_node*> *) 0x12bc1828, (ipcp_value_source<tree_node*> *) 0x12baea70, {offset = -1, cs = 0x3fffb5602630, val = 0x12bc1c48, next = 0x12bae800, index = 0}  "recur_fn","recur_fn"
2, (ipcp_value<tree_node*> *) 0x12bc1880, (ipcp_value_source<tree_node*> *) 0x12bae830, {offset = -1, cs = 0x3fffb5602630, val = 0x12bc1828, next = 0x0, index = 0}  "recur_fn","recur_fn"


Xiong Hu
Thanks

> +      ipcp_value<tree> *src_val;
> +
> +      for (src_val = src_lat->values; src_val && src_val != val;
> +        src_val = src_val->next);
> +
> +      if (!src_val)
> +     return false;
> +    }
> +
>     return true;
>   }
>
> @@ -1649,20 +1716,72 @@ propagate_vals_across_pass_through (cgraph_edge *cs, ipa_jump_func *jfunc,
>     ipcp_value<tree> *src_val;
>     bool ret = false;
>
> -  /* Do not create new values when propagating within an SCC because if there
> -     are arithmetic functions with circular dependencies, there is infinite
> -     number of them and we would just make lattices bottom.  If this condition
> -     is ever relaxed we have to detect self-feeding recursive calls in
> -     cgraph_edge_brings_value_p in a smarter way.  */
> +  /* Due to circular dependencies, propagating within an SCC through arithmetic
> +     transformation would create infinite number of values.  But for
> +     self-feeding recursive function, we could allow propagation in a limited
> +     count, and this can enable a simple kind of recursive function versioning.
> +     For other scenario, we would just make lattices bottom.  */
>     if ((ipa_get_jf_pass_through_operation (jfunc) != NOP_EXPR)
>         && ipa_edge_within_scc (cs))
> -    ret = dest_lat->set_contains_variable ();
> +    {
> +      if (src_lat != dest_lat)
> +     return dest_lat->set_contains_variable ();
> +
> +      auto_vec<ipcp_value<tree> *, 8> val_seeds;
> +
> +      for (src_val = src_lat->values; src_val; src_val = src_val->next)
> +     {
> +       /* Now we do not use self-recursively generated value as propagation
> +          source, this is absolutely conservative, but could avoid explosion
> +          of lattice's value space, especially when one recursive function
> +          calls another recursive.  */
> +       if (self_recursively_generated_p (src_val))
> +         {
> +           /* If the lattice has already been propagated for the call site,
> +              no need to do that again.  */
> +           for (ipcp_value_source<tree> *s = src_val->sources; s;
> +                s = s->next)
> +             if (s->cs == cs)
> +               return dest_lat->set_contains_variable ();
> +         }
> +       else
> +         val_seeds.safe_push (src_val);
> +     }
> +
> +      int clone_copies = PARAM_VALUE (PARAM_IPA_CP_MAX_RECURSION_COPIES);
> +      int i;
> +
> +      /* Recursively generate lattice values with a limited count.  */
> +      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
> +     {
> +       for (int j = 1; j < clone_copies; j++)
> +         {
> +           tree cstval = ipa_get_jf_pass_through_result (jfunc,
> +                                                         src_val->value,
> +                                                         parm_type);
> +           if (!cstval)
> +             break;
> +
> +           /* Try to place the new lattice value after its source, which
> +              can decrease iterations of propagate stage.  */
> +           ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
> +                                       -1, &src_val, true);
> +           gcc_checking_assert (src_val);
> +         }
> +     }
> +      ret |= dest_lat->set_contains_variable ();
> +    }
>     else
>       for (src_val = src_lat->values; src_val; src_val = src_val->next)
>         {
> +       /* Now we do not use self-recursively generated value as propagation
> +          source, otherwise it is easy to make value space of normal lattice
> +          overflow.  */
> +     if (self_recursively_generated_p (src_val))
> +       continue;
> +
>       tree cstval = ipa_get_jf_pass_through_result (jfunc, src_val->value,
>                                                     parm_type);
> -
>       if (cstval)
>         ret |= dest_lat->add_value (cstval, cs, src_val, src_idx);
>       else
> @@ -3635,6 +3754,9 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
>             hot |= cs->maybe_hot_p ();
>             if (cs->caller != dest)
>               non_self_recursive = true;
> +           else if (src->val)
> +             gcc_assert (values_equal_for_ipcp_p (src->val->value,
> +                                                  val->value));
>           }
>         cs = get_next_cgraph_edge_clone (cs);
>       }
> diff --git a/gcc/params.def b/gcc/params.def
> index 322c37f8b96..3e242e09f01 100644
> --- a/gcc/params.def
> +++ b/gcc/params.def
> @@ -1135,6 +1135,11 @@ DEFPARAM (PARAM_IPA_CP_RECURSION_PENALTY,
>         "are evaluated for cloning.",
>         40, 0, 100)
>
> +DEFPARAM (PARAM_IPA_CP_MAX_RECURSION_COPIES,
> +       "ipa-cp-max-recursion-copies",
> +       "Maximum function versioning copies for a self-recursive function.",
> +       8, 0, 0)
> +
>   DEFPARAM (PARAM_IPA_CP_SINGLE_CALL_PENALTY,
>         "ipa-cp-single-call-penalty",
>         "Percentage penalty functions containing a single call to another "
> diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
> new file mode 100644
> index 00000000000..7c1c01047c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
> @@ -0,0 +1,47 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining" } */
> +
> +int fn();
> +
> +int data[100];
> +
> +int recur_fn (int i)
> +{
> +  int j;
> +
> +  if (i == 6)
> +    {
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      fn();
> +      return 10;
> +    }
> +
> +  data[i] = i;
> +
> +  for (j = 0; j < 100; j++)
> +    recur_fn (i + 1);
> +
> +  return i;
> +}
> +
> +int main ()
> +{
> +  int i;
> +
> +  for (i = 0; i < 100; i++)
> +    recur_fn (1) + recur_fn (-5);
> +
> +  return 1;
> +}
> +
> +/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */
>


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: clone.patch --]
[-- Type: text/x-patch; name="clone.patch", Size: 10367 bytes --]

From 51e79933f5264ec35276c3058ed2b5aeb2f28128 Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Tue, 24 Sep 2019 11:48:26 +0800
Subject: [PATCH] cost

---
 gcc/ipa-cp.c                           | 171 ++++++++++++++++++++++---
 gcc/params.def                         |   5 +
 gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c |  47 +++++++
 3 files changed, 204 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index b3e928052dc..dff9eadb64b 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -229,7 +229,9 @@ public:
   inline bool set_contains_variable ();
   bool add_value (valtype newval, cgraph_edge *cs,
 		  ipcp_value<valtype> *src_val = NULL,
-		  int src_idx = 0, HOST_WIDE_INT offset = -1);
+		  int src_idx = 0, HOST_WIDE_INT offset = -1,
+		  ipcp_value<valtype> **val_pos_p = NULL,
+		  bool unlimited = false);
   void print (FILE * f, bool dump_sources, bool dump_benefits);
 };
 
@@ -1733,22 +1735,37 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
 /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
    SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
    meaning.  OFFSET -1 means the source is scalar and not a part of an
-   aggregate.  */
+   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
+   after which newly created ipcp_value will be inserted, and it is also
+   used to record address of the added ipcp_value before function returns.
+   UNLIMITED means whether value count should not exceed the limit given
+   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
 
 template <typename valtype>
 bool
 ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 				  ipcp_value<valtype> *src_val,
-				  int src_idx, HOST_WIDE_INT offset)
+				  int src_idx, HOST_WIDE_INT offset,
+				  ipcp_value<valtype> **val_pos_p,
+				  bool unlimited)
 {
   ipcp_value<valtype> *val;
 
+  if (val_pos_p)
+    {
+      for (val = values; val && val != *val_pos_p; val = val->next);
+      gcc_checking_assert (val);
+    }
+
   if (bottom)
     return false;
 
   for (val = values; val; val = val->next)
     if (values_equal_for_ipcp_p (val->value, newval))
       {
+	if (val_pos_p)
+	  *val_pos_p = val;
+
 	if (ipa_edge_within_scc (cs))
 	  {
 	    ipcp_value_source<valtype> *s;
@@ -1763,7 +1780,8 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	return false;
       }
 
-  if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
+  if (!unlimited
+      && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
     {
       /* We can only free sources, not the values themselves, because sources
 	 of other values in this SCC might point to them.   */
@@ -1777,6 +1795,9 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	    }
 	}
 
+      if (val_pos_p)
+	*val_pos_p = NULL;
+
       values = NULL;
       return set_to_bottom ();
     }
@@ -1784,11 +1805,74 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
   values_count++;
   val = allocate_and_init_ipcp_value (newval);
   val->add_source (cs, src_val, src_idx, offset);
-  val->next = values;
-  values = val;
+  if (val_pos_p)
+    {
+      val->next = (*val_pos_p)->next;
+      (*val_pos_p)->next = val;
+      *val_pos_p = val;
+    }
+  else
+    {
+      val->next = values;
+      values = val;
+    }
+
+  return true;
+}
+
+/* Return true, if a ipcp_value VAL is orginated from parameter value of
+   self-feeding recursive function by applying non-passthrough arithmetic
+   transformation.  */
+
+static bool
+self_recursively_generated_p (ipcp_value<tree> *val)
+{
+  class ipa_node_params *info = NULL;
+
+  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
+    {
+      cgraph_edge *cs = src->cs;
+
+      if (!src->val || cs->caller != cs->callee->function_symbol ()
+	  || src->val == val)
+	return false;
+
+      if (!info)
+	info = IPA_NODE_REF (cs->caller);
+
+      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
+								src->index);
+      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
+						      : plats->aggs;
+      ipcp_value<tree> *src_val;
+
+      for (src_val = src_lat->values; src_val && src_val != val;
+	   src_val = src_val->next);
+
+      if (!src_val)
+	return false;
+    }
+
   return true;
 }
 
+static tree
+get_val_across_arith_op (enum tree_code opcode,
+			 tree opnd1_type,
+			 tree opnd2,
+			 ipcp_value<tree> *src_val,
+			 tree res_type)
+{
+  tree opnd1 = src_val->value;
+
+  /* Skip source values that is incompatible with specified type.  */
+  if (opnd1_type
+      && !useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
+    return NULL_TREE;
+
+  return ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
+}
+
 /* Propagate values through an arithmetic transformation described by a jump
    function associated with edge CS, taking values from SRC_LAT and putting
    them into DEST_LAT.  OPND1_TYPE is expected type for the values in SRC_LAT.
@@ -1812,24 +1896,70 @@ propagate_vals_across_arith_jfunc (cgraph_edge *cs,
   ipcp_value<tree> *src_val;
   bool ret = false;
 
-  /* Do not create new values when propagating within an SCC because if there
-     are arithmetic functions with circular dependencies, there is infinite
-     number of them and we would just make lattices bottom.  If this condition
-     is ever relaxed we have to detect self-feeding recursive calls in
-     cgraph_edge_brings_value_p in a smarter way.  */
+  /* Due to circular dependencies, propagating within an SCC through arithmetic
+     transformation would create infinite number of values.  But for
+     self-feeding recursive function, we could allow propagation in a limited
+     count, and this can enable a simple kind of recursive function versioning.
+     For other scenario, we would just make lattices bottom.  */
   if (opcode != NOP_EXPR && ipa_edge_within_scc (cs))
-    ret = dest_lat->set_contains_variable ();
+    {
+      if (src_lat != dest_lat)
+	return dest_lat->set_contains_variable ();
+
+      auto_vec<ipcp_value<tree> *, 8> val_seeds;
+
+      for (src_val = src_lat->values; src_val; src_val = src_val->next)
+	{
+	  /* Now we do not use self-recursively generated value as propagation
+	     source, this is absolutely conservative, but could avoid explosion
+	     of lattice's value space, especially when one recursive function
+	     calls another recursive.  */
+	  if (self_recursively_generated_p (src_val))
+	    {
+	      /* If the lattice has already been propagated for the call site,
+		 no need to do that again.  */
+	      for (ipcp_value_source<tree> *s = src_val->sources; s;
+		   s = s->next)
+		if (s->cs == cs)
+		  return dest_lat->set_contains_variable ();
+	    }
+	  else
+	    val_seeds.safe_push (src_val);
+	}
+
+      int clone_copies = PARAM_VALUE (PARAM_IPA_CP_MAX_RECURSION_COPIES);
+      int i;
+
+      /* Recursively generate lattice values with a limited count.  */
+      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
+	{
+	  for (int j = 1; j < clone_copies; j++)
+	    {
+	      tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
+						     src_val, res_type);
+	      if (!cstval)
+		break;
+
+	      /* Try to place the new lattice value after its source, which
+		 can decrease iterations of propagate stage.  */
+	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
+					  src_offset, &src_val, true);
+	      gcc_checking_assert (src_val);
+	    }
+	}
+      ret |= dest_lat->set_contains_variable ();
+    }
   else
     for (src_val = src_lat->values; src_val; src_val = src_val->next)
       {
-	tree opnd1 = src_val->value;
-	tree cstval = NULL_TREE;
-
-	/* Skip source values that is incompatible with specified type.  */
-	if (!opnd1_type
-	    || useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
-	  cstval = ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
+	  /* Now we do not use self-recursively generated value as propagation
+	     source, otherwise it is easy to make value space of normal lattice
+	     overflow.  */
+	if (self_recursively_generated_p (src_val))
+	  continue;
 
+	tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
+					       src_val, res_type);
 	if (cstval)
 	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
 				      src_offset);
@@ -3851,6 +3981,9 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
 	      hot |= cs->maybe_hot_p ();
 	      if (cs->caller != dest)
 		non_self_recursive = true;
+	      else if (src->val)
+		gcc_assert (values_equal_for_ipcp_p (src->val->value,
+						     val->value));
 	    }
 	  cs = get_next_cgraph_edge_clone (cs);
 	}
diff --git a/gcc/params.def b/gcc/params.def
index 322c37f8b96..3e242e09f01 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1135,6 +1135,11 @@ DEFPARAM (PARAM_IPA_CP_RECURSION_PENALTY,
 	  "are evaluated for cloning.",
 	  40, 0, 100)
 
+DEFPARAM (PARAM_IPA_CP_MAX_RECURSION_COPIES,
+	  "ipa-cp-max-recursion-copies",
+	  "Maximum function versioning copies for a self-recursive function.",
+	  8, 0, 0)
+
 DEFPARAM (PARAM_IPA_CP_SINGLE_CALL_PENALTY,
 	  "ipa-cp-single-call-penalty",
 	  "Percentage penalty functions containing a single call to another "
diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
new file mode 100644
index 00000000000..f70fb110c67
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining --param ipa-cp-max-recursion-copies=8" } */
+
+int fn();
+
+int data[100];
+
+int recur_fn (int i)
+{
+  int j;
+   
+  if (i == 6)
+    {
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      return 10;
+    }
+
+  data[i] = i; 
+
+  for (j = 0; j < 100; j++)
+    recur_fn (i + 1);
+
+  return i; 
+}
+
+int main ()
+{
+  int i;
+
+  for (i = 0; i < 100; i++)
+    recur_fn (1) + recur_fn (-5);
+
+  return 1;
+}
+
+/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */
-- 
2.17.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-10-24  6:57   ` Feng Xue OS
@ 2019-11-14 13:29     ` Jan Hubicka
  2019-11-14 15:16       ` Feng Xue OS
  0 siblings, 1 reply; 19+ messages in thread
From: Jan Hubicka @ 2019-11-14 13:29 UTC (permalink / raw)
  To: Feng Xue OS; +Cc: luoxhu, gcc-patches, Martin Jambor

Hi,
I think the patch generally looks reasonable

+2019-11-13  Feng Xue <fxue@os.amperecomputing.com>
+
+	PR ipa/92133
+	* doc/invoke.texi (ipa-cp-max-recursion-depth): Document new option.
+	* params.opt (ipa-cp-max-recursion-depth): New.
+	* ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
+	val_pos_p and unlimited.
+	(self_recursively_generated_p): New function.
+	(get_val_across_arith_op): Likewise.
+	(propagate_vals_across_arith_jfunc): Add constant propagation for
+	self-recursive function.
+	(incorporate_penalties): Do not penalize pure self-recursive function.
+	(good_cloning_opportunity_p): Dump node_is_self_scc flag.
+	(propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
+	(get_info_about_necessary_edges): Relax hotness check for edge to
+	self-recursive function.
+	* ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.
+

In general the patch looks good to me, but I would like Martin Jambor to
comment on the ipa-prop/cp interfaces. However...
 
+@item ipa-cp-max-recursion-depth
+Maximum depth of recursive cloning for self-recursive function.
+

... I believe we will need more careful cost model for this.  I think
we want to limit the overall growth for all the clones and also probably
enable this only when ipa-predicates things the individual clones will
actualy be faster by some non-trivial percentage. For recursive inliner
we have:

--param max-inline-recursive-depth which has similar meaning to your parameter
  (so perhaps similar name would be good)
--param min-inline-recursive-probability
  which requires the inlining to happen only across edges which are
  known to be taken with reasonable chance
--param max-inline-insns-recursive
  which specifies overall size after all the recursive inlining

Those parameters are not parituclarly well tought out or tested, but
they may be good start.

Do you have some data on code size/performance effects of this change?

Honza

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-14 13:29     ` Jan Hubicka
@ 2019-11-14 15:16       ` Feng Xue OS
  2019-11-14 15:28         ` Jan Hubicka
  0 siblings, 1 reply; 19+ messages in thread
From: Feng Xue OS @ 2019-11-14 15:16 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: luoxhu, gcc-patches, Martin Jambor

Thanks for your review.

> In general the patch looks good to me, but I would like Martin Jambor to
> comment on the ipa-prop/cp interfaces. However...

> +@item ipa-cp-max-recursion-depth
> +Maximum depth of recursive cloning for self-recursive function.
> +

> ... I believe we will need more careful cost model for this.  I think
> we want to limit the overall growth for all the clones and also probably
> enable this only when ipa-predicates things the individual clones will
> actualy be faster by some non-trivial percentage. For recursive inliner
> we have:

Cost model used by self-recursive cloning is mainly based on existing stuffs
in ipa-cp cloning, size growth and time benefit are considered. But since
recursive cloning is a more aggressive cloning, we will actually have another
problem, which is opposite to your concern.  By default, current parameter
set used to control ipa-cp and recursive-inliner gives priority to code size,
not completely for performance. This makes ipa-cp behave somewhat
conservatively, and as a result, it can not trigger expected recursive cloning
for the case in SPEC2017.exchange2 with default setting, blocked by both
ipa-cp-eval-threshold and ipcp-unit-growth. The former is due to improper
static profile estimation, and the latter is conflicted to aggressiveness of
recursive cloning. Thus, we have to explicitly lower the threshold and raise
percentage of unit-growth.

We might not reach the destination in one leap. This patch is just first step
to enable recursive function versioning. And next, we still need further
elaborate tuning on this.

> --param max-inline-recursive-depth which has similar meaning to your parameter
>  (so perhaps similar name would be good)
> --param min-inline-recursive-probability
>  which requires the inlining to happen only across edges which are
>  known to be taken with reasonable chance
> --param max-inline-insns-recursive
>  which specifies overall size after all the recursive inlining

> Those parameters are not parituclarly well tought out or tested, but
> they may be good start.

> Do you have some data on code size/performance effects of this change?
For spec2017, no obvious code size and performance change with default setting.
Specifically, for exchange2, with ipa-cp-eval-threshold=1 and ipcp-unit-growth=80,
performance +31%, size +7%, on aarch64.

Feng

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-14 15:16       ` Feng Xue OS
@ 2019-11-14 15:28         ` Jan Hubicka
  2019-11-14 16:02           ` Feng Xue OS
  0 siblings, 1 reply; 19+ messages in thread
From: Jan Hubicka @ 2019-11-14 15:28 UTC (permalink / raw)
  To: Feng Xue OS; +Cc: luoxhu, gcc-patches, Martin Jambor

> Thanks for your review.
> 
> > In general the patch looks good to me, but I would like Martin Jambor to
> > comment on the ipa-prop/cp interfaces. However...
> 
> > +@item ipa-cp-max-recursion-depth
> > +Maximum depth of recursive cloning for self-recursive function.
> > +
> 
> > ... I believe we will need more careful cost model for this.  I think
> > we want to limit the overall growth for all the clones and also probably
> > enable this only when ipa-predicates things the individual clones will
> > actualy be faster by some non-trivial percentage. For recursive inliner
> > we have:
> 
> Cost model used by self-recursive cloning is mainly based on existing stuffs
> in ipa-cp cloning, size growth and time benefit are considered. But since
> recursive cloning is a more aggressive cloning, we will actually have another
> problem, which is opposite to your concern.  By default, current parameter
> set used to control ipa-cp and recursive-inliner gives priority to code size,
> not completely for performance. This makes ipa-cp behave somewhat

Yes, for a while the cost model is quite off.  On Firefox it does just
few clonings where code size increases so it desprately needs retuning.

But since rescursive cloning is quite a different case from normal one,
perhaps having independent set of limits would help in particular ...
> conservatively, and as a result, it can not trigger expected recursive cloning
> for the case in SPEC2017.exchange2 with default setting, blocked by both
> ipa-cp-eval-threshold and ipcp-unit-growth. The former is due to improper
> static profile estimation, and the latter is conflicted to aggressiveness of
> recursive cloning. Thus, we have to explicitly lower the threshold and raise
> percentage of unit-growth.
> 
> We might not reach the destination in one leap. This patch is just first step
> to enable recursive function versioning. And next, we still need further
> elaborate tuning on this.
> 
> > --param max-inline-recursive-depth which has similar meaning to your parameter
> >  (so perhaps similar name would be good)
> > --param min-inline-recursive-probability
> >  which requires the inlining to happen only across edges which are
> >  known to be taken with reasonable chance
> > --param max-inline-insns-recursive
> >  which specifies overall size after all the recursive inlining
> 
> > Those parameters are not parituclarly well tought out or tested, but
> > they may be good start.
> 
> > Do you have some data on code size/performance effects of this change?
> For spec2017, no obvious code size and performance change with default setting.
> Specifically, for exchange2, with ipa-cp-eval-threshold=1 and ipcp-unit-growth=80,
> performance +31%, size +7%, on aarch64.

... it will help here since ipa-cp-eval-threshold value needed are quite off of what we need to do.

I wonder about the 80% of unit growth which is also more than we can
enable by default.  How it comes the overal size change is only 7%?

Honza
> 
> Feng

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-14 15:28         ` Jan Hubicka
@ 2019-11-14 16:02           ` Feng Xue OS
  2019-11-14 20:50             ` Jan Hubicka
  0 siblings, 1 reply; 19+ messages in thread
From: Feng Xue OS @ 2019-11-14 16:02 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: luoxhu, gcc-patches, Martin Jambor

>> Cost model used by self-recursive cloning is mainly based on existing stuffs
>> in ipa-cp cloning, size growth and time benefit are considered. But since
>> recursive cloning is a more aggressive cloning, we will actually have another
>> problem, which is opposite to your concern.  By default, current parameter
>> set used to control ipa-cp and recursive-inliner gives priority to code size,
>> not completely for performance. This makes ipa-cp behave somewhat

> Yes, for a while the cost model is quite off.  On Firefox it does just
> few clonings where code size increases so it desprately needs retuning.

> But since rescursive cloning is quite a different case from normal one,
> perhaps having independent set of limits would help in particular ...
I did consider this way, but this seems to be contradictory for normal
and recursive cloning.

> > Do you have some data on code size/performance effects of this change?
> For spec2017, no obvious code size and performance change with default setting.
> Specifically, for exchange2, with ipa-cp-eval-threshold=1 and ipcp-unit-growth=80,
> performance +31%, size +7%, on aarch64.

> ... it will help here since ipa-cp-eval-threshold value needed are quite off of what we need to do.

> I wonder about the 80% of unit growth which is also more than we can
> enable by default.  How it comes the overal size change is only 7%?
343624 -> 365632 (this contains debug info, -g)    recursion-depth=8
273488 -> 273760 (no debug info)   recursion-depth=8

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-14 16:02           ` Feng Xue OS
@ 2019-11-14 20:50             ` Jan Hubicka
  2019-11-15 15:37               ` Feng Xue OS
  0 siblings, 1 reply; 19+ messages in thread
From: Jan Hubicka @ 2019-11-14 20:50 UTC (permalink / raw)
  To: Feng Xue OS; +Cc: luoxhu, gcc-patches, Martin Jambor

> >> Cost model used by self-recursive cloning is mainly based on existing stuffs
> >> in ipa-cp cloning, size growth and time benefit are considered. But since
> >> recursive cloning is a more aggressive cloning, we will actually have another
> >> problem, which is opposite to your concern.  By default, current parameter
> >> set used to control ipa-cp and recursive-inliner gives priority to code size,
> >> not completely for performance. This makes ipa-cp behave somewhat
> 
> > Yes, for a while the cost model is quite off.  On Firefox it does just
> > few clonings where code size increases so it desprately needs retuning.
> 
> > But since rescursive cloning is quite a different case from normal one,
> > perhaps having independent set of limits would help in particular ...
> I did consider this way, but this seems to be contradictory for normal
> and recursive cloning.

We could definitly discuss cost model incrementally. It is safe to do
what you do currently (rely on the existing ipa-cp's overconservative
heuristics).  

> 
> > > Do you have some data on code size/performance effects of this change?
> > For spec2017, no obvious code size and performance change with default setting.
> > Specifically, for exchange2, with ipa-cp-eval-threshold=1 and ipcp-unit-growth=80,
> > performance +31%, size +7%, on aarch64.
> 
> > ... it will help here since ipa-cp-eval-threshold value needed are quite off of what we need to do.
> 
> > I wonder about the 80% of unit growth which is also more than we can
> > enable by default.  How it comes the overal size change is only 7%?
> 343624 -> 365632 (this contains debug info, -g)    recursion-depth=8
> 273488 -> 273760 (no debug info)   recursion-depth=8

What seems bit odd is that ipcp's metrics ends up with 80% code growth.
I will try to look into it and see if I can think better what to do
about the costs.

Honza

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-14 20:50             ` Jan Hubicka
@ 2019-11-15 15:37               ` Feng Xue OS
  2019-11-22  5:26                 ` Ping: " Feng Xue OS
  2019-11-22 11:34                 ` Martin Jambor
  0 siblings, 2 replies; 19+ messages in thread
From: Feng Xue OS @ 2019-11-15 15:37 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: luoxhu, gcc-patches, Martin Jambor

[-- Attachment #1: Type: text/plain, Size: 2389 bytes --]

Honza,

   I made some changes: do not penalize self-recursive function, and add --param ipa-cp-min-recursive-probability, similar to recursive inline. Please review this new one.

Thanks,
Feng

________________________________________
From: Jan Hubicka <hubicka@ucw.cz>
Sent: Friday, November 15, 2019 4:33 AM
To: Feng Xue OS
Cc: luoxhu; gcc-patches@gcc.gnu.org; Martin Jambor
Subject: Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)

> >> Cost model used by self-recursive cloning is mainly based on existing stuffs
> >> in ipa-cp cloning, size growth and time benefit are considered. But since
> >> recursive cloning is a more aggressive cloning, we will actually have another
> >> problem, which is opposite to your concern.  By default, current parameter
> >> set used to control ipa-cp and recursive-inliner gives priority to code size,
> >> not completely for performance. This makes ipa-cp behave somewhat
>
> > Yes, for a while the cost model is quite off.  On Firefox it does just
> > few clonings where code size increases so it desprately needs retuning.
>
> > But since rescursive cloning is quite a different case from normal one,
> > perhaps having independent set of limits would help in particular ...
> I did consider this way, but this seems to be contradictory for normal
> and recursive cloning.

We could definitly discuss cost model incrementally. It is safe to do
what you do currently (rely on the existing ipa-cp's overconservative
heuristics).

>
> > > Do you have some data on code size/performance effects of this change?
> > For spec2017, no obvious code size and performance change with default setting.
> > Specifically, for exchange2, with ipa-cp-eval-threshold=1 and ipcp-unit-growth=80,
> > performance +31%, size +7%, on aarch64.
>
> > ... it will help here since ipa-cp-eval-threshold value needed are quite off of what we need to do.
>
> > I wonder about the 80% of unit growth which is also more than we can
> > enable by default.  How it comes the overal size change is only 7%?
> 343624 -> 365632 (this contains debug info, -g)    recursion-depth=8
> 273488 -> 273760 (no debug info)   recursion-depth=8

What seems bit odd is that ipcp's metrics ends up with 80% code growth.
I will try to look into it and see if I can think better what to do
about the costs.

Honza

[-- Attachment #2: 0001-recursive-clone.patch --]
[-- Type: application/octet-stream, Size: 16658 bytes --]

From 3f90176d1d15822897cfc744c8f88a1686f75edc Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Tue, 24 Sep 2019 11:48:26 +0800
Subject: [PATCH] cost

---
 gcc/ChangeLog                          |  20 +++
 gcc/doc/invoke.texi                    |   7 +
 gcc/ipa-cp.c                           | 215 ++++++++++++++++++++++---
 gcc/ipa-prop.h                         |   2 +
 gcc/params.opt                         |   8 +
 gcc/testsuite/ChangeLog                |   5 +
 gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c |  47 ++++++
 7 files changed, 280 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index fe8daf40888..c84f4d73ed6 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,23 @@
+2019-11-15  Feng Xue <fxue@os.amperecomputing.com>
+
+	PR ipa/92133
+	* doc/invoke.texi (ipa-cp-max-recursive-depth): Document new option.
+	(ipa-cp-min-recursive-probability): Likewise.
+	* params.opt (ipa-cp-max-recursive-depth): New.
+	(ipa-cp-min-recursive-probability): Likewise.
+	* ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
+	val_pos_p and unlimited.
+	(self_recursively_generated_p): New function.
+	(get_val_across_arith_op): Likewise.
+	(propagate_vals_across_arith_jfunc): Add constant propagation for
+	self-recursive function.
+	(incorporate_penalties): Do not penalize pure self-recursive function.
+	(good_cloning_opportunity_p): Dump node_is_self_scc flag.
+	(propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
+	(get_info_about_necessary_edges): Relax hotness check for edge to
+	self-recursive function.
+	* ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.
+
 2019-11-15  Feng Xue  <fxue@os.amperecomputing.com>
 
 	PR ipa/92528
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index fe79ca2247a..c30adfd31c2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -12045,6 +12045,13 @@ IPA-CP calculates its own score of cloning profitability heuristics
 and performs those cloning opportunities with scores that exceed
 @option{ipa-cp-eval-threshold}.
 
+@item ipa-cp-max-recursive-depth
+Maximum depth of recursive cloning for self-recursive function.
+
+@item ipa-cp-min-recursive-probability
+Recursive cloning only when the probability of call being executed exceeds
+the parameter.
+
 @item ipa-cp-recursion-penalty
 Percentage penalty the recursive functions will receive when they
 are evaluated for cloning.
diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 8372dfaa771..bbf508bca6c 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -228,7 +228,9 @@ public:
   inline bool set_contains_variable ();
   bool add_value (valtype newval, cgraph_edge *cs,
 		  ipcp_value<valtype> *src_val = NULL,
-		  int src_idx = 0, HOST_WIDE_INT offset = -1);
+		  int src_idx = 0, HOST_WIDE_INT offset = -1,
+		  ipcp_value<valtype> **val_pos_p = NULL,
+		  bool unlimited = false);
   void print (FILE * f, bool dump_sources, bool dump_benefits);
 };
 
@@ -1817,22 +1819,37 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
 /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
    SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
    meaning.  OFFSET -1 means the source is scalar and not a part of an
-   aggregate.  */
+   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
+   after which newly created ipcp_value will be inserted, and it is also
+   used to record address of the added ipcp_value before function returns.
+   UNLIMITED means whether value count should not exceed the limit given
+   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
 
 template <typename valtype>
 bool
 ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 				  ipcp_value<valtype> *src_val,
-				  int src_idx, HOST_WIDE_INT offset)
+				  int src_idx, HOST_WIDE_INT offset,
+				  ipcp_value<valtype> **val_pos_p,
+				  bool unlimited)
 {
   ipcp_value<valtype> *val;
 
+  if (val_pos_p)
+    {
+      for (val = values; val && val != *val_pos_p; val = val->next);
+      gcc_checking_assert (val);
+    }
+
   if (bottom)
     return false;
 
   for (val = values; val; val = val->next)
     if (values_equal_for_ipcp_p (val->value, newval))
       {
+	if (val_pos_p)
+	  *val_pos_p = val;
+
 	if (ipa_edge_within_scc (cs))
 	  {
 	    ipcp_value_source<valtype> *s;
@@ -1847,7 +1864,7 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	return false;
       }
 
-  if (values_count == param_ipa_cp_value_list_size)
+  if (!unlimited && values_count == param_ipa_cp_value_list_size)
     {
       /* We can only free sources, not the values themselves, because sources
 	 of other values in this SCC might point to them.   */
@@ -1861,6 +1878,9 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	    }
 	}
 
+      if (val_pos_p)
+	*val_pos_p = NULL;
+
       values = NULL;
       return set_to_bottom ();
     }
@@ -1868,11 +1888,74 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
   values_count++;
   val = allocate_and_init_ipcp_value (newval);
   val->add_source (cs, src_val, src_idx, offset);
-  val->next = values;
-  values = val;
+  if (val_pos_p)
+    {
+      val->next = (*val_pos_p)->next;
+      (*val_pos_p)->next = val;
+      *val_pos_p = val;
+    }
+  else
+    {
+      val->next = values;
+      values = val;
+    }
+
+  return true;
+}
+
+/* Return true, if a ipcp_value VAL is orginated from parameter value of
+   self-feeding recursive function by applying non-passthrough arithmetic
+   transformation.  */
+
+static bool
+self_recursively_generated_p (ipcp_value<tree> *val)
+{
+  class ipa_node_params *info = NULL;
+
+  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
+    {
+      cgraph_edge *cs = src->cs;
+
+      if (!src->val || cs->caller != cs->callee->function_symbol ()
+	  || src->val == val)
+	return false;
+
+      if (!info)
+	info = IPA_NODE_REF (cs->caller);
+
+      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
+								src->index);
+      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
+						      : plats->aggs;
+      ipcp_value<tree> *src_val;
+
+      for (src_val = src_lat->values; src_val && src_val != val;
+	   src_val = src_val->next);
+
+      if (!src_val)
+	return false;
+    }
+
   return true;
 }
 
+static tree
+get_val_across_arith_op (enum tree_code opcode,
+			 tree opnd1_type,
+			 tree opnd2,
+			 ipcp_value<tree> *src_val,
+			 tree res_type)
+{
+  tree opnd1 = src_val->value;
+
+  /* Skip source values that is incompatible with specified type.  */
+  if (opnd1_type
+      && !useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
+    return NULL_TREE;
+
+  return ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
+}
+
 /* Propagate values through an arithmetic transformation described by a jump
    function associated with edge CS, taking values from SRC_LAT and putting
    them into DEST_LAT.  OPND1_TYPE is expected type for the values in SRC_LAT.
@@ -1896,24 +1979,74 @@ propagate_vals_across_arith_jfunc (cgraph_edge *cs,
   ipcp_value<tree> *src_val;
   bool ret = false;
 
-  /* Do not create new values when propagating within an SCC because if there
-     are arithmetic functions with circular dependencies, there is infinite
-     number of them and we would just make lattices bottom.  If this condition
-     is ever relaxed we have to detect self-feeding recursive calls in
-     cgraph_edge_brings_value_p in a smarter way.  */
+  /* Due to circular dependencies, propagating within an SCC through arithmetic
+     transformation would create infinite number of values.  But for
+     self-feeding recursive function, we could allow propagation in a limited
+     count, and this can enable a simple kind of recursive function versioning.
+     For other scenario, we would just make lattices bottom.  */
   if (opcode != NOP_EXPR && ipa_edge_within_scc (cs))
-    ret = dest_lat->set_contains_variable ();
+    {
+      int i;
+
+      if (src_lat != dest_lat || param_ipa_cp_max_recursive_depth < 1)
+	return dest_lat->set_contains_variable ();
+
+      /* No benefit if recursive execution is in low probability.  */
+      if (cs->sreal_frequency () * 100
+	  <= ((sreal) 1) * param_ipa_cp_min_recursive_probability)
+	return dest_lat->set_contains_variable ();
+
+      auto_vec<ipcp_value<tree> *, 8> val_seeds;
+
+      for (src_val = src_lat->values; src_val; src_val = src_val->next)
+	{
+	  /* Now we do not use self-recursively generated value as propagation
+	     source, this is absolutely conservative, but could avoid explosion
+	     of lattice's value space, especially when one recursive function
+	     calls another recursive.  */
+	  if (self_recursively_generated_p (src_val))
+	    {
+	      /* If the lattice has already been propagated for the call site,
+		 no need to do that again.  */
+	      for (ipcp_value_source<tree> *s = src_val->sources; s;
+		   s = s->next)
+		if (s->cs == cs)
+		  return dest_lat->set_contains_variable ();
+	    }
+	  else
+	    val_seeds.safe_push (src_val);
+	}
+
+      /* Recursively generate lattice values with a limited count.  */
+      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
+	{
+	  for (int j = 1; j < param_ipa_cp_max_recursive_depth; j++)
+	    {
+	      tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
+						     src_val, res_type);
+	      if (!cstval)
+		break;
+
+	      /* Try to place the new lattice value after its source, which
+		 can decrease iterations of propagate stage.  */
+	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
+					  src_offset, &src_val, true);
+	      gcc_checking_assert (src_val);
+	    }
+	}
+      ret |= dest_lat->set_contains_variable ();
+    }
   else
     for (src_val = src_lat->values; src_val; src_val = src_val->next)
       {
-	tree opnd1 = src_val->value;
-	tree cstval = NULL_TREE;
-
-	/* Skip source values that is incompatible with specified type.  */
-	if (!opnd1_type
-	    || useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
-	  cstval = ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
+	/* Now we do not use self-recursively generated value as propagation
+	   source, otherwise it is easy to make value space of normal lattice
+	   overflow.  */
+	if (self_recursively_generated_p (src_val))
+	  continue;
 
+	tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
+					       src_val, res_type);
 	if (cstval)
 	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
 				      src_offset);
@@ -3032,7 +3165,7 @@ hint_time_bonus (ipa_hints hints)
 static inline int64_t
 incorporate_penalties (ipa_node_params *info, int64_t evaluation)
 {
-  if (info->node_within_scc)
+  if (info->node_within_scc && !info->node_is_self_scc)
     evaluation = (evaluation
 		  * (100 - param_ipa_cp_recursion_penalty)) / 100;
 
@@ -3076,7 +3209,8 @@ good_cloning_opportunity_p (struct cgraph_node *node, int time_benefit,
 	  count_sum.dump (dump_file);
 	  fprintf (dump_file, "%s%s) -> evaluation: " "%" PRId64
 		 ", threshold: %i\n",
-		 info->node_within_scc ? ", scc" : "",
+		 info->node_within_scc
+		   ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "",
 		 info->node_calling_single_call ? ", single_call" : "",
 		 evaluation, param_ipa_cp_eval_threshold);
 	}
@@ -3094,7 +3228,8 @@ good_cloning_opportunity_p (struct cgraph_node *node, int time_benefit,
 		 "size: %i, freq_sum: %i%s%s) -> evaluation: "
 		 "%" PRId64 ", threshold: %i\n",
 		 time_benefit, size_cost, freq_sum,
-		 info->node_within_scc ? ", scc" : "",
+		 info->node_within_scc
+		   ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "",
 		 info->node_calling_single_call ? ", single_call" : "",
 		 evaluation, param_ipa_cp_eval_threshold);
 
@@ -3588,14 +3723,30 @@ propagate_constants_topo (class ipa_topo_info *topo)
       while (v)
 	{
 	  struct cgraph_edge *cs;
+	  class ipa_node_params *info = NULL;
+	  bool self_scc = true;
 
 	  for (cs = v->callees; cs; cs = cs->next_callee)
 	    if (ipa_edge_within_scc (cs))
 	      {
-		IPA_NODE_REF (v)->node_within_scc = true;
+		cgraph_node *callee = cs->callee->function_symbol ();
+
+		if (v != callee)
+		  self_scc = false;
+
+		if (!info)
+		  {
+		    info = IPA_NODE_REF (v);
+		    info->node_within_scc = true;
+		  }
+
 		if (propagate_constants_across_call (cs))
-		  push_node_to_stack (topo, cs->callee->function_symbol ());
+		  push_node_to_stack (topo, callee);
 	      }
+
+	  if (info)
+	    info->node_is_self_scc = self_scc;
+
 	  v = pop_node_from_stack (topo);
 	}
 
@@ -3977,6 +4128,9 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
 	      hot |= cs->maybe_hot_p ();
 	      if (cs->caller != dest)
 		non_self_recursive = true;
+	      else if (src->val)
+		gcc_assert (values_equal_for_ipcp_p (src->val->value,
+						     val->value));
 	    }
 	  cs = get_next_cgraph_edge_clone (cs);
 	}
@@ -3990,6 +4144,19 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
   *freq_sum = freq;
   *count_sum = cnt;
   *caller_count = count;
+
+  if (!hot && IPA_NODE_REF (dest)->node_within_scc)
+    {
+      struct cgraph_edge *cs;
+
+      /* Cold non-SCC source edge could trigger hot recursive execution of
+	 function. Consider the case as hot and rely on following cost model
+	 computation to further select right one.  */
+      for (cs = dest->callers; cs; cs = cs->next_caller)
+	if (cs->caller == dest && cs->maybe_hot_p ())
+	  return true;
+    }
+
   return hot;
 }
 
diff --git a/gcc/ipa-prop.h b/gcc/ipa-prop.h
index 9e85ef8aace..685f621a4ba 100644
--- a/gcc/ipa-prop.h
+++ b/gcc/ipa-prop.h
@@ -495,6 +495,8 @@ public:
   unsigned node_dead : 1;
   /* Node is involved in a recursion, potentionally indirect.  */
   unsigned node_within_scc : 1;
+  /* Node contains only direct recursion.  */
+  unsigned node_is_self_scc : 1;
   /* Node is calling a private function called only once.  */
   unsigned node_calling_single_call : 1;
   /* False when there is something makes versioning impossible.  */
diff --git a/gcc/params.opt b/gcc/params.opt
index d60db5825cc..0a833a8942c 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -210,6 +210,14 @@ Threshold ipa-cp opportunity evaluation that is still considered beneficial to c
 Common Joined UInteger Var(param_ipa_cp_loop_hint_bonus) Init(64) Param
 Compile-time bonus IPA-CP assigns to candidates which make loop bounds or strides known.
 
+-param=ipa-cp-max-recursive-depth=
+Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(7) Param
+Maximum depth of recursive cloning for self-recursive function.
+
+-param=ipa-cp-min-recursive-probability=
+Common Joined UInteger Var(param_ipa_cp_min_recursive_probability) Init(5) Param
+Recursive cloning only when the probability of call being executed exceeds the parameter.
+
 -param=ipa-cp-recursion-penalty=
 Common Joined UInteger Var(param_ipa_cp_recursion_penalty) Init(40) IntegerRange(0, 100) Param
 Percentage penalty the recursive functions will receive when they are evaluated for cloning.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index b835d671eba..153f8fd29e8 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -166,6 +166,11 @@
 	gcc.dg/gnu2x-attr-syntax-1.c, gcc.dg/gnu2x-attr-syntax-2.c,
 	gcc.dg/gnu2x-attrs-1.c: New tests.
 
+2019-11-14  Feng Xue  <fxue@os.amperecomputing.com>
+
+	PR ipa/92133
+	* gcc.dg/ipa/ipa-clone-2.c: New test.
+
 2019-11-14  Feng Xue  <fxue@os.amperecomputing.com>
 
 	PR ipa/91682
diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
new file mode 100644
index 00000000000..c6c26f73ee5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining --param ipa-cp-max-recursion-depth=8" } */
+
+int fn();
+
+int data[100];
+
+int recur_fn (int i)
+{
+  int j;
+   
+  if (i == 6)
+    {
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      return 10;
+    }
+
+  data[i] = i; 
+
+  for (j = 0; j < 100; j++)
+    recur_fn (i + 1);
+
+  return i; 
+}
+
+int main ()
+{
+  int i;
+
+  for (i = 0; i < 100; i++)
+    recur_fn (1) + recur_fn (-5);
+
+  return 1;
+}
+
+/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */
-- 
2.17.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Ping: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-15 15:37               ` Feng Xue OS
@ 2019-11-22  5:26                 ` Feng Xue OS
  2019-11-22 11:34                 ` Martin Jambor
  1 sibling, 0 replies; 19+ messages in thread
From: Feng Xue OS @ 2019-11-22  5:26 UTC (permalink / raw)
  To: Jan Hubicka, mjambor; +Cc: luoxhu, gcc-patches

Honza, Martin,

  Hope your more comments on this patch. Not sure you base option on it. I think this can be a start point for
recursive versioning. And later we definitely need further cost-model tuning not only on this, but also whole
 ipa-cp to enable more aggressive cloning.

Thanks,
Feng

________________________________________
From: Feng Xue OS <fxue@os.amperecomputing.com>
Sent: Friday, November 15, 2019 11:32 PM
To: Jan Hubicka
Cc: luoxhu; gcc-patches@gcc.gnu.org; Martin Jambor
Subject: Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)

Honza,

   I made some changes: do not penalize self-recursive function, and add --param ipa-cp-min-recursive-probability, similar to recursive inline. Please review this new one.

Thanks,
Feng

________________________________________
From: Jan Hubicka <hubicka@ucw.cz>
Sent: Friday, November 15, 2019 4:33 AM
To: Feng Xue OS
Cc: luoxhu; gcc-patches@gcc.gnu.org; Martin Jambor
Subject: Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)

> >> Cost model used by self-recursive cloning is mainly based on existing stuffs
> >> in ipa-cp cloning, size growth and time benefit are considered. But since
> >> recursive cloning is a more aggressive cloning, we will actually have another
> >> problem, which is opposite to your concern.  By default, current parameter
> >> set used to control ipa-cp and recursive-inliner gives priority to code size,
> >> not completely for performance. This makes ipa-cp behave somewhat
>
> > Yes, for a while the cost model is quite off.  On Firefox it does just
> > few clonings where code size increases so it desprately needs retuning.
>
> > But since rescursive cloning is quite a different case from normal one,
> > perhaps having independent set of limits would help in particular ...
> I did consider this way, but this seems to be contradictory for normal
> and recursive cloning.

We could definitly discuss cost model incrementally. It is safe to do
what you do currently (rely on the existing ipa-cp's overconservative
heuristics).

>
> > > Do you have some data on code size/performance effects of this change?
> > For spec2017, no obvious code size and performance change with default setting.
> > Specifically, for exchange2, with ipa-cp-eval-threshold=1 and ipcp-unit-growth=80,
> > performance +31%, size +7%, on aarch64.
>
> > ... it will help here since ipa-cp-eval-threshold value needed are quite off of what we need to do.
>
> > I wonder about the 80% of unit growth which is also more than we can
> > enable by default.  How it comes the overal size change is only 7%?
> 343624 -> 365632 (this contains debug info, -g)    recursion-depth=8
> 273488 -> 273760 (no debug info)   recursion-depth=8

What seems bit odd is that ipcp's metrics ends up with 80% code growth.
I will try to look into it and see if I can think better what to do
about the costs.

Honza

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-15 15:37               ` Feng Xue OS
  2019-11-22  5:26                 ` Ping: " Feng Xue OS
@ 2019-11-22 11:34                 ` Martin Jambor
  2019-11-25 14:17                   ` Feng Xue OS
  1 sibling, 1 reply; 19+ messages in thread
From: Martin Jambor @ 2019-11-22 11:34 UTC (permalink / raw)
  To: Feng Xue OS, Jan Hubicka; +Cc: luoxhu, gcc-patches

Hi,

On Fri, Nov 15 2019, Feng Xue OS wrote:
> Honza,
>
> I made some changes: do not penalize self-recursive function, and
> add --param ipa-cp-min-recursive-probability, similar to recursive
> inline. Please review this new one.

The patch and its effect on exchange is intriguing, I only have a few
comments, see below, otherwise I am happy about it.

> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index fe8daf40888..c84f4d73ed6 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,23 @@
> +2019-11-15  Feng Xue <fxue@os.amperecomputing.com>
> +
> +	PR ipa/92133
> +	* doc/invoke.texi (ipa-cp-max-recursive-depth): Document new option.
> +	(ipa-cp-min-recursive-probability): Likewise.
> +	* params.opt (ipa-cp-max-recursive-depth): New.
> +	(ipa-cp-min-recursive-probability): Likewise.
> +	* ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
> +	val_pos_p and unlimited.
> +	(self_recursively_generated_p): New function.
> +	(get_val_across_arith_op): Likewise.
> +	(propagate_vals_across_arith_jfunc): Add constant propagation for
> +	self-recursive function.
> +	(incorporate_penalties): Do not penalize pure self-recursive function.
> +	(good_cloning_opportunity_p): Dump node_is_self_scc flag.
> +	(propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
> +	(get_info_about_necessary_edges): Relax hotness check for edge to
> +	self-recursive function.
> +	* ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.
> +

The least important comment: Thanks for providing the ChangeLog but
sending ChangeLog in the patch itself makes it difficult for people to
apply the patch because the ChangeLog hunks will never apply cleanly.
That's why people send them in the email body when they email
gcc-patches.  Hopefully we'll be putting ChangeLogs only into the commit
message soon and all of this will be a non-issue.

>  2019-11-15  Feng Xue  <fxue@os.amperecomputing.com>
>  
>  	PR ipa/92528
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index fe79ca2247a..c30adfd31c2 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -12045,6 +12045,13 @@ IPA-CP calculates its own score of cloning profitability heuristics
>  and performs those cloning opportunities with scores that exceed
>  @option{ipa-cp-eval-threshold}.
>  
> +@item ipa-cp-max-recursive-depth
> +Maximum depth of recursive cloning for self-recursive function.
> +
> +@item ipa-cp-min-recursive-probability
> +Recursive cloning only when the probability of call being executed exceeds
> +the parameter.
> +
>  @item ipa-cp-recursion-penalty
>  Percentage penalty the recursive functions will receive when they
>  are evaluated for cloning.
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 8372dfaa771..bbf508bca6c 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -228,7 +228,9 @@ public:
>    inline bool set_contains_variable ();
>    bool add_value (valtype newval, cgraph_edge *cs,
>  		  ipcp_value<valtype> *src_val = NULL,
> -		  int src_idx = 0, HOST_WIDE_INT offset = -1);
> +		  int src_idx = 0, HOST_WIDE_INT offset = -1,
> +		  ipcp_value<valtype> **val_pos_p = NULL,
> +		  bool unlimited = false);
>    void print (FILE * f, bool dump_sources, bool dump_benefits);
>  };
>  
> @@ -1817,22 +1819,37 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
>  /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
>     SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
>     meaning.  OFFSET -1 means the source is scalar and not a part of an
> -   aggregate.  */
> +   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
> +   after which newly created ipcp_value will be inserted, and it is also
> +   used to record address of the added ipcp_value before function returns.
> +   UNLIMITED means whether value count should not exceed the limit given
> +   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
>  
>  template <typename valtype>
>  bool
>  ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>  				  ipcp_value<valtype> *src_val,
> -				  int src_idx, HOST_WIDE_INT offset)
> +				  int src_idx, HOST_WIDE_INT offset,
> +				  ipcp_value<valtype> **val_pos_p,
> +				  bool unlimited)
>  {
>    ipcp_value<valtype> *val;
>  
> +  if (val_pos_p)
> +    {
> +      for (val = values; val && val != *val_pos_p; val = val->next);

Please put empty statements on a separate line (I think there is one
more in the patch IIRC).

> +      gcc_checking_assert (val);
> +    }
> +
>    if (bottom)
>      return false;
>  
>    for (val = values; val; val = val->next)
>      if (values_equal_for_ipcp_p (val->value, newval))
>        {
> +	if (val_pos_p)
> +	  *val_pos_p = val;
> +
>  	if (ipa_edge_within_scc (cs))
>  	  {
>  	    ipcp_value_source<valtype> *s;
> @@ -1847,7 +1864,7 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>  	return false;
>        }
>  
> -  if (values_count == param_ipa_cp_value_list_size)
> +  if (!unlimited && values_count == param_ipa_cp_value_list_size)
>      {
>        /* We can only free sources, not the values themselves, because sources
>  	 of other values in this SCC might point to them.   */
> @@ -1861,6 +1878,9 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>  	    }
>  	}
>  
> +      if (val_pos_p)
> +	*val_pos_p = NULL;
> +
>        values = NULL;
>        return set_to_bottom ();
>      }
> @@ -1868,11 +1888,74 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
>    values_count++;
>    val = allocate_and_init_ipcp_value (newval);
>    val->add_source (cs, src_val, src_idx, offset);
> -  val->next = values;
> -  values = val;
> +  if (val_pos_p)
> +    {
> +      val->next = (*val_pos_p)->next;
> +      (*val_pos_p)->next = val;
> +      *val_pos_p = val;
> +    }

I must say I am not a big fan of the val_pos_p parameter.  Would simply
putting new values always at the end of the list work to reduce the
iterations?  At this point the function has traversed all the values
already when searching if it is present, so it can remember the last
one and just add stuff after it.

But if I'm missing something clever I'm not going to oppose it.


> +  else
> +    {
> +      val->next = values;
> +      values = val;
> +    }
> +
> +  return true;
> +}
> +
> +/* Return true, if a ipcp_value VAL is orginated from parameter value of
> +   self-feeding recursive function by applying non-passthrough arithmetic
> +   transformation.  */
> +
> +static bool
> +self_recursively_generated_p (ipcp_value<tree> *val)
> +{
> +  class ipa_node_params *info = NULL;
> +
> +  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
> +    {
> +      cgraph_edge *cs = src->cs;
> +
> +      if (!src->val || cs->caller != cs->callee->function_symbol ()
> +	  || src->val == val)
> +	return false;

I'd suggest putting the condition calling function_symbol last.

> +
> +      if (!info)
> +	info = IPA_NODE_REF (cs->caller);
> +
> +      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
> +								src->index);
> +      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
> +						      : plats->aggs;
> +      ipcp_value<tree> *src_val;
> +
> +      for (src_val = src_lat->values; src_val && src_val != val;
> +	   src_val = src_val->next);

I think that GNU code style dictates that when a for does not fit on a
single line, the three bits have to be on a separate line each.  Plus
please put the empty statement on its own line too.

> +
> +      if (!src_val)
> +	return false;
> +    }
> +
>    return true;
>  }
>  
> +static tree
> +get_val_across_arith_op (enum tree_code opcode,
> +			 tree opnd1_type,
> +			 tree opnd2,
> +			 ipcp_value<tree> *src_val,
> +			 tree res_type)

The function should get at least a brief comment.

> +{
> +  tree opnd1 = src_val->value;
> +
> +  /* Skip source values that is incompatible with specified type.  */
> +  if (opnd1_type
> +      && !useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
> +    return NULL_TREE;
> +
> +  return ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
> +}
> +
>  /* Propagate values through an arithmetic transformation described by a jump
>     function associated with edge CS, taking values from SRC_LAT and putting
>     them into DEST_LAT.  OPND1_TYPE is expected type for the values in SRC_LAT.
> @@ -1896,24 +1979,74 @@ propagate_vals_across_arith_jfunc (cgraph_edge *cs,
>    ipcp_value<tree> *src_val;
>    bool ret = false;
>  
> -  /* Do not create new values when propagating within an SCC because if there
> -     are arithmetic functions with circular dependencies, there is infinite
> -     number of them and we would just make lattices bottom.  If this condition
> -     is ever relaxed we have to detect self-feeding recursive calls in
> -     cgraph_edge_brings_value_p in a smarter way.  */
> +  /* Due to circular dependencies, propagating within an SCC through arithmetic
> +     transformation would create infinite number of values.  But for
> +     self-feeding recursive function, we could allow propagation in a limited
> +     count, and this can enable a simple kind of recursive function versioning.
> +     For other scenario, we would just make lattices bottom.  */
>    if (opcode != NOP_EXPR && ipa_edge_within_scc (cs))
> -    ret = dest_lat->set_contains_variable ();
> +    {
> +      int i;
> +
> +      if (src_lat != dest_lat || param_ipa_cp_max_recursive_depth < 1)
> +	return dest_lat->set_contains_variable ();
> +
> +      /* No benefit if recursive execution is in low probability.  */
> +      if (cs->sreal_frequency () * 100
> +	  <= ((sreal) 1) * param_ipa_cp_min_recursive_probability)
> +	return dest_lat->set_contains_variable ();
> +
> +      auto_vec<ipcp_value<tree> *, 8> val_seeds;
> +
> +      for (src_val = src_lat->values; src_val; src_val = src_val->next)
> +	{
> +	  /* Now we do not use self-recursively generated value as propagation
> +	     source, this is absolutely conservative, but could avoid explosion
> +	     of lattice's value space, especially when one recursive function
> +	     calls another recursive.  */
> +	  if (self_recursively_generated_p (src_val))
> +	    {
> +	      /* If the lattice has already been propagated for the call site,
> +		 no need to do that again.  */
> +	      for (ipcp_value_source<tree> *s = src_val->sources; s;
> +		   s = s->next)

I'm afraid you'll have to reformat this for loop too.

> +		if (s->cs == cs)
> +		  return dest_lat->set_contains_variable ();
> +	    }
> +	  else
> +	    val_seeds.safe_push (src_val);
> +	}
> +
> +      /* Recursively generate lattice values with a limited count.  */
> +      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
> +	{
> +	  for (int j = 1; j < param_ipa_cp_max_recursive_depth; j++)
> +	    {
> +	      tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
> +						     src_val, res_type);
> +	      if (!cstval)
> +		break;
> +
> +	      /* Try to place the new lattice value after its source, which
> +		 can decrease iterations of propagate stage.  */
> +	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
> +					  src_offset, &src_val, true);
> +	      gcc_checking_assert (src_val);
> +	    }
> +	}
> +      ret |= dest_lat->set_contains_variable ();
> +    }
>    else
>      for (src_val = src_lat->values; src_val; src_val = src_val->next)
>        {
> -	tree opnd1 = src_val->value;
> -	tree cstval = NULL_TREE;
> -
> -	/* Skip source values that is incompatible with specified type.  */
> -	if (!opnd1_type
> -	    || useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
> -	  cstval = ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
> +	/* Now we do not use self-recursively generated value as propagation
> +	   source, otherwise it is easy to make value space of normal lattice
> +	   overflow.  */
> +	if (self_recursively_generated_p (src_val))
> +	  continue;

I think you are missing a necessary call to
dest_lat->set_contains_variable() here.  Either call it or initialize
cstval to zero and call get_val_across_arith_op only when
self_recursively_generated_p returns false;

>  
> +	tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
> +					       src_val, res_type);
>  	if (cstval)
>  	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
>  				      src_offset);
> @@ -3032,7 +3165,7 @@ hint_time_bonus (ipa_hints hints)
>  static inline int64_t
>  incorporate_penalties (ipa_node_params *info, int64_t evaluation)
>  {
> -  if (info->node_within_scc)
> +  if (info->node_within_scc && !info->node_is_self_scc)
>      evaluation = (evaluation
>  		  * (100 - param_ipa_cp_recursion_penalty)) / 100;
>  

The rest looks good to me, thanks for the patch, I hope to see it in the
trunk soon.

I'll be looking into IPA-CP tuning shortly too.  If you already have any
ideas, I'd be interested in hearing them.

Thanks,

Martin

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-22 11:34                 ` Martin Jambor
@ 2019-11-25 14:17                   ` Feng Xue OS
  2019-11-27  2:07                     ` Feng Xue OS
  0 siblings, 1 reply; 19+ messages in thread
From: Feng Xue OS @ 2019-11-25 14:17 UTC (permalink / raw)
  To: Martin Jambor, Jan Hubicka; +Cc: luoxhu, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3902 bytes --]

Martin,

    Thanks for your review. I updated the patch with your comments.

Feng
---
2019-11-15  Feng Xue <fxue@os.amperecomputing.com>

	PR ipa/92133
	* doc/invoke.texi (ipa-cp-max-recursive-depth): Document new option.
	(ipa-cp-min-recursive-probability): Likewise.
	* params.opt (ipa-cp-max-recursive-depth): New.
	(ipa-cp-min-recursive-probability): Likewise.
	* ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
	val_p and unlimited.
	(self_recursively_generated_p): New function.
	(get_val_across_arith_op): Likewise.
	(propagate_vals_across_arith_jfunc): Add constant propagation for
	self-recursive function.
	(incorporate_penalties): Do not penalize pure self-recursive function.
	(good_cloning_opportunity_p): Dump node_is_self_scc flag.
	(propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
	(get_info_about_necessary_edges): Relax hotness check for edge to
	self-recursive function.
	* ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.
---

> The least important comment: Thanks for providing the ChangeLog but
> sending ChangeLog in the patch itself makes it difficult for people to
> apply the patch because the ChangeLog hunks will never apply cleanly.
> That's why people send them in the email body when they email
> gcc-patches.  Hopefully we'll be putting ChangeLogs only into the commit
> message soon and all of this will be a non-issue.
Ok.

>> +  if (val_pos_p)
>> +    {
>> +      for (val = values; val && val != *val_pos_p; val = val->next);

> Please put empty statements on a separate line (I think there is one
> more in the patch IIRC).
Done.

>> +  if (val_pos_p)
>> +    {
>> +      val->next = (*val_pos_p)->next;
>> +      (*val_pos_p)->next = val;
>> +      *val_pos_p = val;
>> +    }

> I must say I am not a big fan of the val_pos_p parameter.  Would simply
> putting new values always at the end of the list work to reduce the
> iterations?  At this point the function has traversed all the values
> already when searching if it is present, so it can remember the last
> one and just add stuff after it.
We need a parameter to record address of the added value, which will be used
in constructing next one. To place new value at the end of list is a good idea,
thus meaning of val_pos_p becomes simpler, it is only an out parameter now.

>> +      if (!src->val || cs->caller != cs->callee->function_symbol ()
>> +       || src->val == val)
>> +     return false;

> I'd suggest putting the condition calling function_symbol last.
Original condition order can reduce unnecessary evaluations on src->val==val,
which is false in most situation, while cs->caller != cs->callee->function_symbol ()
is true. 

>> +      for (src_val = src_lat->values; src_val && src_val != val;
>> +        src_val = src_val->next);

> I think that GNU code style dictates that when a for does not fit on a
> single line, the three bits have to be on a separate line each.  Plus
> please put the empty statement on its own line too.
Done.

>> +static tree
>> +get_val_across_arith_op (enum tree_code opcode,
>> +                      tree opnd1_type,
>> +                      tree opnd2,
>> +                      ipcp_value<tree> *src_val,
>> +                      tree res_type)

> The function should get at least a brief comment.
Done.

>> +           for (ipcp_value_source<tree> *s = src_val->sources; s;
>> +                s = s->next)

> I'm afraid you'll have to reformat this for loop too.
Done.
>> +     if (self_recursively_generated_p (src_val))
>> +       continue;

> I think you are missing a necessary call to
> dest_lat->set_contains_variable() here.  Either call it or initialize
> cstval to zero and call get_val_across_arith_op only when
> self_recursively_generated_p returns false;
Yes, this is a bug. Fixed.

[-- Attachment #2: 0001-Enable-recursive-function-versioning.patch --]
[-- Type: application/octet-stream, Size: 15207 bytes --]

From 8e2fa8f97a832c41a32ddc531d7d5d426ed1ed14 Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Tue, 24 Sep 2019 11:48:26 +0800
Subject: [PATCH] Enable recursive function versioning

---
 gcc/doc/invoke.texi                    |   7 +
 gcc/ipa-cp.c                           | 221 ++++++++++++++++++++++---
 gcc/ipa-prop.h                         |   2 +
 gcc/params.opt                         |   8 +
 gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c |  47 ++++++
 5 files changed, 258 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 403be47d893..d165f31a865 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -12035,6 +12035,13 @@ IPA-CP calculates its own score of cloning profitability heuristics
 and performs those cloning opportunities with scores that exceed
 @option{ipa-cp-eval-threshold}.
 
+@item ipa-cp-max-recursive-depth
+Maximum depth of recursive cloning for self-recursive function.
+
+@item ipa-cp-min-recursive-probability
+Recursive cloning only when the probability of call being executed exceeds
+the parameter.
+
 @item ipa-cp-recursion-penalty
 Percentage penalty the recursive functions will receive when they
 are evaluated for cloning.
diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 31a98a3d98a..f58c2b4fe93 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -228,7 +228,9 @@ public:
   inline bool set_contains_variable ();
   bool add_value (valtype newval, cgraph_edge *cs,
 		  ipcp_value<valtype> *src_val = NULL,
-		  int src_idx = 0, HOST_WIDE_INT offset = -1);
+		  int src_idx = 0, HOST_WIDE_INT offset = -1,
+		  ipcp_value<valtype> **val_p = NULL,
+		  bool unlimited = false);
   void print (FILE * f, bool dump_sources, bool dump_benefits);
 };
 
@@ -1817,22 +1819,32 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
 /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
    SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
    meaning.  OFFSET -1 means the source is scalar and not a part of an
-   aggregate.  */
+   aggregate.  If non-NULL, VAL_P records address of existing or newly added
+   ipcp_value.  UNLIMITED means whether value count should not exceed the limit
+   given by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
 
 template <typename valtype>
 bool
 ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 				  ipcp_value<valtype> *src_val,
-				  int src_idx, HOST_WIDE_INT offset)
+				  int src_idx, HOST_WIDE_INT offset,
+				  ipcp_value<valtype> **val_p,
+				  bool unlimited)
 {
-  ipcp_value<valtype> *val;
+  ipcp_value<valtype> *val, *last_val = NULL;
+
+  if (val_p)
+    *val_p = NULL;
 
   if (bottom)
     return false;
 
-  for (val = values; val; val = val->next)
+  for (val = values; val; last_val = val, val = val->next)
     if (values_equal_for_ipcp_p (val->value, newval))
       {
+	if (val_p)
+	  *val_p = val;
+
 	if (ipa_edge_within_scc (cs))
 	  {
 	    ipcp_value_source<valtype> *s;
@@ -1847,7 +1859,7 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	return false;
       }
 
-  if (values_count == param_ipa_cp_value_list_size)
+  if (!unlimited && values_count == param_ipa_cp_value_list_size)
     {
       /* We can only free sources, not the values themselves, because sources
 	 of other values in this SCC might point to them.   */
@@ -1860,7 +1872,6 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	      ipcp_sources_pool.remove ((ipcp_value_source<tree>*)src);
 	    }
 	}
-
       values = NULL;
       return set_to_bottom ();
     }
@@ -1868,11 +1879,81 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
   values_count++;
   val = allocate_and_init_ipcp_value (newval);
   val->add_source (cs, src_val, src_idx, offset);
-  val->next = values;
-  values = val;
+  val->next = NULL;
+
+  /* Add the new value to end of value list, which can reduce iterations
+     of propagation stage for recursive function.  */
+  if (last_val)
+    last_val->next = val;
+  else
+    values = val;
+
+  if (val_p)
+    *val_p = val;
+
+  return true;
+}
+
+/* Return true, if a ipcp_value VAL is orginated from parameter value of
+   self-feeding recursive function by applying non-passthrough arithmetic
+   transformation.  */
+
+static bool
+self_recursively_generated_p (ipcp_value<tree> *val)
+{
+  class ipa_node_params *info = NULL;
+
+  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
+    {
+      cgraph_edge *cs = src->cs;
+
+      if (!src->val || cs->caller != cs->callee->function_symbol ()
+	  || src->val == val)
+	return false;
+
+      if (!info)
+	info = IPA_NODE_REF (cs->caller);
+
+      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
+								src->index);
+      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
+						      : plats->aggs;
+      ipcp_value<tree> *src_val;
+
+      for (src_val = src_lat->values; src_val; src_val = src_val->next)
+	if (src_val == val)
+	  break;
+
+      if (!src_val)
+	return false;
+    }
+
   return true;
 }
 
+/* A helper function that returns result of operation specified by OPCODE on
+   the value of SRC_VAL.  If non-NULL, OPND1_TYPE is expected type for the
+   value of SRC_VAL.  If the operation is binary, OPND2 is a constant value
+   acting as its second operand.  If non-NULL, RES_TYPE is expected type of
+   the result.  */
+
+static tree
+get_val_across_arith_op (enum tree_code opcode,
+			 tree opnd1_type,
+			 tree opnd2,
+			 ipcp_value<tree> *src_val,
+			 tree res_type)
+{
+  tree opnd1 = src_val->value;
+
+  /* Skip source values that is incompatible with specified type.  */
+  if (opnd1_type
+      && !useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
+    return NULL_TREE;
+
+  return ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
+}
+
 /* Propagate values through an arithmetic transformation described by a jump
    function associated with edge CS, taking values from SRC_LAT and putting
    them into DEST_LAT.  OPND1_TYPE is expected type for the values in SRC_LAT.
@@ -1896,24 +1977,76 @@ propagate_vals_across_arith_jfunc (cgraph_edge *cs,
   ipcp_value<tree> *src_val;
   bool ret = false;
 
-  /* Do not create new values when propagating within an SCC because if there
-     are arithmetic functions with circular dependencies, there is infinite
-     number of them and we would just make lattices bottom.  If this condition
-     is ever relaxed we have to detect self-feeding recursive calls in
-     cgraph_edge_brings_value_p in a smarter way.  */
+  /* Due to circular dependencies, propagating within an SCC through arithmetic
+     transformation would create infinite number of values.  But for
+     self-feeding recursive function, we could allow propagation in a limited
+     count, and this can enable a simple kind of recursive function versioning.
+     For other scenario, we would just make lattices bottom.  */
   if (opcode != NOP_EXPR && ipa_edge_within_scc (cs))
-    ret = dest_lat->set_contains_variable ();
+    {
+      int i;
+
+      if (src_lat != dest_lat || param_ipa_cp_max_recursive_depth < 1)
+	return dest_lat->set_contains_variable ();
+
+      /* No benefit if recursive execution is in low probability.  */
+      if (cs->sreal_frequency () * 100
+	  <= ((sreal) 1) * param_ipa_cp_min_recursive_probability)
+	return dest_lat->set_contains_variable ();
+
+      auto_vec<ipcp_value<tree> *, 8> val_seeds;
+
+      for (src_val = src_lat->values; src_val; src_val = src_val->next)
+	{
+	  /* Now we do not use self-recursively generated value as propagation
+	     source, this is absolutely conservative, but could avoid explosion
+	     of lattice's value space, especially when one recursive function
+	     calls another recursive.  */
+	  if (self_recursively_generated_p (src_val))
+	    {
+	      ipcp_value_source<tree> *s;
+
+	      /* If the lattice has already been propagated for the call site,
+		 no need to do that again.  */
+	      for (s = src_val->sources; s; s = s->next)
+		if (s->cs == cs)
+		  return dest_lat->set_contains_variable ();
+	    }
+	  else
+	    val_seeds.safe_push (src_val);
+	}
+
+      /* Recursively generate lattice values with a limited count.  */
+      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
+	{
+	  for (int j = 1; j < param_ipa_cp_max_recursive_depth; j++)
+	    {
+	      tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
+						     src_val, res_type);
+	      if (!cstval)
+		break;
+
+	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
+					  src_offset, &src_val, true);
+	      gcc_checking_assert (src_val);
+	    }
+	}
+      ret |= dest_lat->set_contains_variable ();
+    }
   else
     for (src_val = src_lat->values; src_val; src_val = src_val->next)
       {
-	tree opnd1 = src_val->value;
-	tree cstval = NULL_TREE;
-
-	/* Skip source values that is incompatible with specified type.  */
-	if (!opnd1_type
-	    || useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
-	  cstval = ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
+	/* Now we do not use self-recursively generated value as propagation
+	   source, otherwise it is easy to make value space of normal lattice
+	   overflow.  */
+	if (self_recursively_generated_p (src_val))
+	  {
+	    ret |= dest_lat->set_contains_variable ();
+	    continue;
+	  }
 
+	tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
+					       src_val, res_type);
 	if (cstval)
 	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
 				      src_offset);
@@ -3038,7 +3171,7 @@ hint_time_bonus (ipa_hints hints)
 static inline int64_t
 incorporate_penalties (ipa_node_params *info, int64_t evaluation)
 {
-  if (info->node_within_scc)
+  if (info->node_within_scc && !info->node_is_self_scc)
     evaluation = (evaluation
 		  * (100 - param_ipa_cp_recursion_penalty)) / 100;
 
@@ -3082,7 +3215,8 @@ good_cloning_opportunity_p (struct cgraph_node *node, int time_benefit,
 	  count_sum.dump (dump_file);
 	  fprintf (dump_file, "%s%s) -> evaluation: " "%" PRId64
 		 ", threshold: %i\n",
-		 info->node_within_scc ? ", scc" : "",
+		 info->node_within_scc
+		   ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "",
 		 info->node_calling_single_call ? ", single_call" : "",
 		 evaluation, param_ipa_cp_eval_threshold);
 	}
@@ -3100,7 +3234,8 @@ good_cloning_opportunity_p (struct cgraph_node *node, int time_benefit,
 		 "size: %i, freq_sum: %i%s%s) -> evaluation: "
 		 "%" PRId64 ", threshold: %i\n",
 		 time_benefit, size_cost, freq_sum,
-		 info->node_within_scc ? ", scc" : "",
+		 info->node_within_scc
+		   ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "",
 		 info->node_calling_single_call ? ", single_call" : "",
 		 evaluation, param_ipa_cp_eval_threshold);
 
@@ -3594,14 +3729,30 @@ propagate_constants_topo (class ipa_topo_info *topo)
       while (v)
 	{
 	  struct cgraph_edge *cs;
+	  class ipa_node_params *info = NULL;
+	  bool self_scc = true;
 
 	  for (cs = v->callees; cs; cs = cs->next_callee)
 	    if (ipa_edge_within_scc (cs))
 	      {
-		IPA_NODE_REF (v)->node_within_scc = true;
+		cgraph_node *callee = cs->callee->function_symbol ();
+
+		if (v != callee)
+		  self_scc = false;
+
+		if (!info)
+		  {
+		    info = IPA_NODE_REF (v);
+		    info->node_within_scc = true;
+		  }
+
 		if (propagate_constants_across_call (cs))
-		  push_node_to_stack (topo, cs->callee->function_symbol ());
+		  push_node_to_stack (topo, callee);
 	      }
+
+	  if (info)
+	    info->node_is_self_scc = self_scc;
+
 	  v = pop_node_from_stack (topo);
 	}
 
@@ -3983,6 +4134,9 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
 	      hot |= cs->maybe_hot_p ();
 	      if (cs->caller != dest)
 		non_self_recursive = true;
+	      else if (src->val)
+		gcc_assert (values_equal_for_ipcp_p (src->val->value,
+						     val->value));
 	    }
 	  cs = get_next_cgraph_edge_clone (cs);
 	}
@@ -3996,6 +4150,19 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
   *freq_sum = freq;
   *count_sum = cnt;
   *caller_count = count;
+
+  if (!hot && IPA_NODE_REF (dest)->node_within_scc)
+    {
+      struct cgraph_edge *cs;
+
+      /* Cold non-SCC source edge could trigger hot recursive execution of
+	 function. Consider the case as hot and rely on following cost model
+	 computation to further select right one.  */
+      for (cs = dest->callers; cs; cs = cs->next_caller)
+	if (cs->caller == dest && cs->maybe_hot_p ())
+	  return true;
+    }
+
   return hot;
 }
 
diff --git a/gcc/ipa-prop.h b/gcc/ipa-prop.h
index e9d6a5e8305..1958e1ee9cc 100644
--- a/gcc/ipa-prop.h
+++ b/gcc/ipa-prop.h
@@ -497,6 +497,8 @@ public:
   unsigned node_dead : 1;
   /* Node is involved in a recursion, potentionally indirect.  */
   unsigned node_within_scc : 1;
+  /* Node contains only direct recursion.  */
+  unsigned node_is_self_scc : 1;
   /* Node is calling a private function called only once.  */
   unsigned node_calling_single_call : 1;
   /* False when there is something makes versioning impossible.  */
diff --git a/gcc/params.opt b/gcc/params.opt
index 586b539ec5f..f4b810ded44 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -198,6 +198,14 @@ Threshold ipa-cp opportunity evaluation that is still considered beneficial to c
 Common Joined UInteger Var(param_ipa_cp_loop_hint_bonus) Init(64) Param
 Compile-time bonus IPA-CP assigns to candidates which make loop bounds or strides known.
 
+-param=ipa-cp-max-recursive-depth=
+Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(7) Param
+Maximum depth of recursive cloning for self-recursive function.
+
+-param=ipa-cp-min-recursive-probability=
+Common Joined UInteger Var(param_ipa_cp_min_recursive_probability) Init(2) Param
+Recursive cloning only when the probability of call being executed exceeds the parameter.
+
 -param=ipa-cp-recursion-penalty=
 Common Joined UInteger Var(param_ipa_cp_recursion_penalty) Init(40) IntegerRange(0, 100) Param
 Percentage penalty the recursive functions will receive when they are evaluated for cloning.
diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
new file mode 100644
index 00000000000..d513020ee8b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining --param ipa-cp-max-recursive-depth=8" } */
+
+int fn();
+
+int data[100];
+
+int recur_fn (int i)
+{
+  int j;
+   
+  if (i == 6)
+    {
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      return 10;
+    }
+
+  data[i] = i; 
+
+  for (j = 0; j < 100; j++)
+    recur_fn (i + 1);
+
+  return i; 
+}
+
+int main ()
+{
+  int i;
+
+  for (i = 0; i < 100; i++)
+    recur_fn (1) + recur_fn (-5);
+
+  return 1;
+}
+
+/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */
-- 
2.17.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-25 14:17                   ` Feng Xue OS
@ 2019-11-27  2:07                     ` Feng Xue OS
  2019-11-27 15:12                       ` Jan Hubicka
  2019-12-01 23:20                       ` Jeff Law
  0 siblings, 2 replies; 19+ messages in thread
From: Feng Xue OS @ 2019-11-27  2:07 UTC (permalink / raw)
  To: Martin Jambor, Jan Hubicka, Richard Biener; +Cc: luoxhu, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 4478 bytes --]

Hi, Richard,

  This patch is a not bugfix, while it is small. Martin and Honza are fine with it.
But now we are in stage 3, is it ok to commit?

Thanks,
Feng

________________________________________
From: Feng Xue OS <fxue@os.amperecomputing.com>
Sent: Monday, November 25, 2019 10:17 PM
To: Martin Jambor; Jan Hubicka
Cc: luoxhu; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)

Martin,

    Thanks for your review. I updated the patch with your comments.

Feng
---
2019-11-15  Feng Xue <fxue@os.amperecomputing.com>

        PR ipa/92133
        * doc/invoke.texi (ipa-cp-max-recursive-depth): Document new option.
        (ipa-cp-min-recursive-probability): Likewise.
        * params.opt (ipa-cp-max-recursive-depth): New.
        (ipa-cp-min-recursive-probability): Likewise.
        * ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
        val_p and unlimited.
        (self_recursively_generated_p): New function.
        (get_val_across_arith_op): Likewise.
        (propagate_vals_across_arith_jfunc): Add constant propagation for
        self-recursive function.
        (incorporate_penalties): Do not penalize pure self-recursive function.
        (good_cloning_opportunity_p): Dump node_is_self_scc flag.
        (propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
        (get_info_about_necessary_edges): Relax hotness check for edge to
        self-recursive function.
        * ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.
---

> The least important comment: Thanks for providing the ChangeLog but
> sending ChangeLog in the patch itself makes it difficult for people to
> apply the patch because the ChangeLog hunks will never apply cleanly.
> That's why people send them in the email body when they email
> gcc-patches.  Hopefully we'll be putting ChangeLogs only into the commit
> message soon and all of this will be a non-issue.
Ok.

>> +  if (val_pos_p)
>> +    {
>> +      for (val = values; val && val != *val_pos_p; val = val->next);

> Please put empty statements on a separate line (I think there is one
> more in the patch IIRC).
Done.

>> +  if (val_pos_p)
>> +    {
>> +      val->next = (*val_pos_p)->next;
>> +      (*val_pos_p)->next = val;
>> +      *val_pos_p = val;
>> +    }

> I must say I am not a big fan of the val_pos_p parameter.  Would simply
> putting new values always at the end of the list work to reduce the
> iterations?  At this point the function has traversed all the values
> already when searching if it is present, so it can remember the last
> one and just add stuff after it.
We need a parameter to record address of the added value, which will be used
in constructing next one. To place new value at the end of list is a good idea,
thus meaning of val_pos_p becomes simpler, it is only an out parameter now.

>> +      if (!src->val || cs->caller != cs->callee->function_symbol ()
>> +       || src->val == val)
>> +     return false;

> I'd suggest putting the condition calling function_symbol last.
Original condition order can reduce unnecessary evaluations on src->val==val,
which is false in most situation, while cs->caller != cs->callee->function_symbol ()
is true.

>> +      for (src_val = src_lat->values; src_val && src_val != val;
>> +        src_val = src_val->next);

> I think that GNU code style dictates that when a for does not fit on a
> single line, the three bits have to be on a separate line each.  Plus
> please put the empty statement on its own line too.
Done.

>> +static tree
>> +get_val_across_arith_op (enum tree_code opcode,
>> +                      tree opnd1_type,
>> +                      tree opnd2,
>> +                      ipcp_value<tree> *src_val,
>> +                      tree res_type)

> The function should get at least a brief comment.
Done.

>> +           for (ipcp_value_source<tree> *s = src_val->sources; s;
>> +                s = s->next)

> I'm afraid you'll have to reformat this for loop too.
Done.
>> +     if (self_recursively_generated_p (src_val))
>> +       continue;

> I think you are missing a necessary call to
> dest_lat->set_contains_variable() here.  Either call it or initialize
> cstval to zero and call get_val_across_arith_op only when
> self_recursively_generated_p returns false;
Yes, this is a bug. Fixed.

[-- Attachment #2: 0001-Enable-recursive-function-versioning.patch --]
[-- Type: application/octet-stream, Size: 15207 bytes --]

From 8e2fa8f97a832c41a32ddc531d7d5d426ed1ed14 Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Tue, 24 Sep 2019 11:48:26 +0800
Subject: [PATCH] Enable recursive function versioning

---
 gcc/doc/invoke.texi                    |   7 +
 gcc/ipa-cp.c                           | 221 ++++++++++++++++++++++---
 gcc/ipa-prop.h                         |   2 +
 gcc/params.opt                         |   8 +
 gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c |  47 ++++++
 5 files changed, 258 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 403be47d893..d165f31a865 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -12035,6 +12035,13 @@ IPA-CP calculates its own score of cloning profitability heuristics
 and performs those cloning opportunities with scores that exceed
 @option{ipa-cp-eval-threshold}.
 
+@item ipa-cp-max-recursive-depth
+Maximum depth of recursive cloning for self-recursive function.
+
+@item ipa-cp-min-recursive-probability
+Recursive cloning only when the probability of call being executed exceeds
+the parameter.
+
 @item ipa-cp-recursion-penalty
 Percentage penalty the recursive functions will receive when they
 are evaluated for cloning.
diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 31a98a3d98a..f58c2b4fe93 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -228,7 +228,9 @@ public:
   inline bool set_contains_variable ();
   bool add_value (valtype newval, cgraph_edge *cs,
 		  ipcp_value<valtype> *src_val = NULL,
-		  int src_idx = 0, HOST_WIDE_INT offset = -1);
+		  int src_idx = 0, HOST_WIDE_INT offset = -1,
+		  ipcp_value<valtype> **val_p = NULL,
+		  bool unlimited = false);
   void print (FILE * f, bool dump_sources, bool dump_benefits);
 };
 
@@ -1817,22 +1819,32 @@ allocate_and_init_ipcp_value (ipa_polymorphic_call_context source)
 /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  CS,
    SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
    meaning.  OFFSET -1 means the source is scalar and not a part of an
-   aggregate.  */
+   aggregate.  If non-NULL, VAL_P records address of existing or newly added
+   ipcp_value.  UNLIMITED means whether value count should not exceed the limit
+   given by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
 
 template <typename valtype>
 bool
 ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 				  ipcp_value<valtype> *src_val,
-				  int src_idx, HOST_WIDE_INT offset)
+				  int src_idx, HOST_WIDE_INT offset,
+				  ipcp_value<valtype> **val_p,
+				  bool unlimited)
 {
-  ipcp_value<valtype> *val;
+  ipcp_value<valtype> *val, *last_val = NULL;
+
+  if (val_p)
+    *val_p = NULL;
 
   if (bottom)
     return false;
 
-  for (val = values; val; val = val->next)
+  for (val = values; val; last_val = val, val = val->next)
     if (values_equal_for_ipcp_p (val->value, newval))
       {
+	if (val_p)
+	  *val_p = val;
+
 	if (ipa_edge_within_scc (cs))
 	  {
 	    ipcp_value_source<valtype> *s;
@@ -1847,7 +1859,7 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	return false;
       }
 
-  if (values_count == param_ipa_cp_value_list_size)
+  if (!unlimited && values_count == param_ipa_cp_value_list_size)
     {
       /* We can only free sources, not the values themselves, because sources
 	 of other values in this SCC might point to them.   */
@@ -1860,7 +1872,6 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
 	      ipcp_sources_pool.remove ((ipcp_value_source<tree>*)src);
 	    }
 	}
-
       values = NULL;
       return set_to_bottom ();
     }
@@ -1868,11 +1879,81 @@ ipcp_lattice<valtype>::add_value (valtype newval, cgraph_edge *cs,
   values_count++;
   val = allocate_and_init_ipcp_value (newval);
   val->add_source (cs, src_val, src_idx, offset);
-  val->next = values;
-  values = val;
+  val->next = NULL;
+
+  /* Add the new value to end of value list, which can reduce iterations
+     of propagation stage for recursive function.  */
+  if (last_val)
+    last_val->next = val;
+  else
+    values = val;
+
+  if (val_p)
+    *val_p = val;
+
+  return true;
+}
+
+/* Return true, if a ipcp_value VAL is orginated from parameter value of
+   self-feeding recursive function by applying non-passthrough arithmetic
+   transformation.  */
+
+static bool
+self_recursively_generated_p (ipcp_value<tree> *val)
+{
+  class ipa_node_params *info = NULL;
+
+  for (ipcp_value_source<tree> *src = val->sources; src; src = src->next)
+    {
+      cgraph_edge *cs = src->cs;
+
+      if (!src->val || cs->caller != cs->callee->function_symbol ()
+	  || src->val == val)
+	return false;
+
+      if (!info)
+	info = IPA_NODE_REF (cs->caller);
+
+      class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
+								src->index);
+      ipcp_lattice<tree> *src_lat = src->offset == -1 ? &plats->itself
+						      : plats->aggs;
+      ipcp_value<tree> *src_val;
+
+      for (src_val = src_lat->values; src_val; src_val = src_val->next)
+	if (src_val == val)
+	  break;
+
+      if (!src_val)
+	return false;
+    }
+
   return true;
 }
 
+/* A helper function that returns result of operation specified by OPCODE on
+   the value of SRC_VAL.  If non-NULL, OPND1_TYPE is expected type for the
+   value of SRC_VAL.  If the operation is binary, OPND2 is a constant value
+   acting as its second operand.  If non-NULL, RES_TYPE is expected type of
+   the result.  */
+
+static tree
+get_val_across_arith_op (enum tree_code opcode,
+			 tree opnd1_type,
+			 tree opnd2,
+			 ipcp_value<tree> *src_val,
+			 tree res_type)
+{
+  tree opnd1 = src_val->value;
+
+  /* Skip source values that is incompatible with specified type.  */
+  if (opnd1_type
+      && !useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
+    return NULL_TREE;
+
+  return ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
+}
+
 /* Propagate values through an arithmetic transformation described by a jump
    function associated with edge CS, taking values from SRC_LAT and putting
    them into DEST_LAT.  OPND1_TYPE is expected type for the values in SRC_LAT.
@@ -1896,24 +1977,76 @@ propagate_vals_across_arith_jfunc (cgraph_edge *cs,
   ipcp_value<tree> *src_val;
   bool ret = false;
 
-  /* Do not create new values when propagating within an SCC because if there
-     are arithmetic functions with circular dependencies, there is infinite
-     number of them and we would just make lattices bottom.  If this condition
-     is ever relaxed we have to detect self-feeding recursive calls in
-     cgraph_edge_brings_value_p in a smarter way.  */
+  /* Due to circular dependencies, propagating within an SCC through arithmetic
+     transformation would create infinite number of values.  But for
+     self-feeding recursive function, we could allow propagation in a limited
+     count, and this can enable a simple kind of recursive function versioning.
+     For other scenario, we would just make lattices bottom.  */
   if (opcode != NOP_EXPR && ipa_edge_within_scc (cs))
-    ret = dest_lat->set_contains_variable ();
+    {
+      int i;
+
+      if (src_lat != dest_lat || param_ipa_cp_max_recursive_depth < 1)
+	return dest_lat->set_contains_variable ();
+
+      /* No benefit if recursive execution is in low probability.  */
+      if (cs->sreal_frequency () * 100
+	  <= ((sreal) 1) * param_ipa_cp_min_recursive_probability)
+	return dest_lat->set_contains_variable ();
+
+      auto_vec<ipcp_value<tree> *, 8> val_seeds;
+
+      for (src_val = src_lat->values; src_val; src_val = src_val->next)
+	{
+	  /* Now we do not use self-recursively generated value as propagation
+	     source, this is absolutely conservative, but could avoid explosion
+	     of lattice's value space, especially when one recursive function
+	     calls another recursive.  */
+	  if (self_recursively_generated_p (src_val))
+	    {
+	      ipcp_value_source<tree> *s;
+
+	      /* If the lattice has already been propagated for the call site,
+		 no need to do that again.  */
+	      for (s = src_val->sources; s; s = s->next)
+		if (s->cs == cs)
+		  return dest_lat->set_contains_variable ();
+	    }
+	  else
+	    val_seeds.safe_push (src_val);
+	}
+
+      /* Recursively generate lattice values with a limited count.  */
+      FOR_EACH_VEC_ELT (val_seeds, i, src_val)
+	{
+	  for (int j = 1; j < param_ipa_cp_max_recursive_depth; j++)
+	    {
+	      tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
+						     src_val, res_type);
+	      if (!cstval)
+		break;
+
+	      ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
+					  src_offset, &src_val, true);
+	      gcc_checking_assert (src_val);
+	    }
+	}
+      ret |= dest_lat->set_contains_variable ();
+    }
   else
     for (src_val = src_lat->values; src_val; src_val = src_val->next)
       {
-	tree opnd1 = src_val->value;
-	tree cstval = NULL_TREE;
-
-	/* Skip source values that is incompatible with specified type.  */
-	if (!opnd1_type
-	    || useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1)))
-	  cstval = ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type);
+	/* Now we do not use self-recursively generated value as propagation
+	   source, otherwise it is easy to make value space of normal lattice
+	   overflow.  */
+	if (self_recursively_generated_p (src_val))
+	  {
+	    ret |= dest_lat->set_contains_variable ();
+	    continue;
+	  }
 
+	tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
+					       src_val, res_type);
 	if (cstval)
 	  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
 				      src_offset);
@@ -3038,7 +3171,7 @@ hint_time_bonus (ipa_hints hints)
 static inline int64_t
 incorporate_penalties (ipa_node_params *info, int64_t evaluation)
 {
-  if (info->node_within_scc)
+  if (info->node_within_scc && !info->node_is_self_scc)
     evaluation = (evaluation
 		  * (100 - param_ipa_cp_recursion_penalty)) / 100;
 
@@ -3082,7 +3215,8 @@ good_cloning_opportunity_p (struct cgraph_node *node, int time_benefit,
 	  count_sum.dump (dump_file);
 	  fprintf (dump_file, "%s%s) -> evaluation: " "%" PRId64
 		 ", threshold: %i\n",
-		 info->node_within_scc ? ", scc" : "",
+		 info->node_within_scc
+		   ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "",
 		 info->node_calling_single_call ? ", single_call" : "",
 		 evaluation, param_ipa_cp_eval_threshold);
 	}
@@ -3100,7 +3234,8 @@ good_cloning_opportunity_p (struct cgraph_node *node, int time_benefit,
 		 "size: %i, freq_sum: %i%s%s) -> evaluation: "
 		 "%" PRId64 ", threshold: %i\n",
 		 time_benefit, size_cost, freq_sum,
-		 info->node_within_scc ? ", scc" : "",
+		 info->node_within_scc
+		   ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "",
 		 info->node_calling_single_call ? ", single_call" : "",
 		 evaluation, param_ipa_cp_eval_threshold);
 
@@ -3594,14 +3729,30 @@ propagate_constants_topo (class ipa_topo_info *topo)
       while (v)
 	{
 	  struct cgraph_edge *cs;
+	  class ipa_node_params *info = NULL;
+	  bool self_scc = true;
 
 	  for (cs = v->callees; cs; cs = cs->next_callee)
 	    if (ipa_edge_within_scc (cs))
 	      {
-		IPA_NODE_REF (v)->node_within_scc = true;
+		cgraph_node *callee = cs->callee->function_symbol ();
+
+		if (v != callee)
+		  self_scc = false;
+
+		if (!info)
+		  {
+		    info = IPA_NODE_REF (v);
+		    info->node_within_scc = true;
+		  }
+
 		if (propagate_constants_across_call (cs))
-		  push_node_to_stack (topo, cs->callee->function_symbol ());
+		  push_node_to_stack (topo, callee);
 	      }
+
+	  if (info)
+	    info->node_is_self_scc = self_scc;
+
 	  v = pop_node_from_stack (topo);
 	}
 
@@ -3983,6 +4134,9 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
 	      hot |= cs->maybe_hot_p ();
 	      if (cs->caller != dest)
 		non_self_recursive = true;
+	      else if (src->val)
+		gcc_assert (values_equal_for_ipcp_p (src->val->value,
+						     val->value));
 	    }
 	  cs = get_next_cgraph_edge_clone (cs);
 	}
@@ -3996,6 +4150,19 @@ get_info_about_necessary_edges (ipcp_value<valtype> *val, cgraph_node *dest,
   *freq_sum = freq;
   *count_sum = cnt;
   *caller_count = count;
+
+  if (!hot && IPA_NODE_REF (dest)->node_within_scc)
+    {
+      struct cgraph_edge *cs;
+
+      /* Cold non-SCC source edge could trigger hot recursive execution of
+	 function. Consider the case as hot and rely on following cost model
+	 computation to further select right one.  */
+      for (cs = dest->callers; cs; cs = cs->next_caller)
+	if (cs->caller == dest && cs->maybe_hot_p ())
+	  return true;
+    }
+
   return hot;
 }
 
diff --git a/gcc/ipa-prop.h b/gcc/ipa-prop.h
index e9d6a5e8305..1958e1ee9cc 100644
--- a/gcc/ipa-prop.h
+++ b/gcc/ipa-prop.h
@@ -497,6 +497,8 @@ public:
   unsigned node_dead : 1;
   /* Node is involved in a recursion, potentionally indirect.  */
   unsigned node_within_scc : 1;
+  /* Node contains only direct recursion.  */
+  unsigned node_is_self_scc : 1;
   /* Node is calling a private function called only once.  */
   unsigned node_calling_single_call : 1;
   /* False when there is something makes versioning impossible.  */
diff --git a/gcc/params.opt b/gcc/params.opt
index 586b539ec5f..f4b810ded44 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -198,6 +198,14 @@ Threshold ipa-cp opportunity evaluation that is still considered beneficial to c
 Common Joined UInteger Var(param_ipa_cp_loop_hint_bonus) Init(64) Param
 Compile-time bonus IPA-CP assigns to candidates which make loop bounds or strides known.
 
+-param=ipa-cp-max-recursive-depth=
+Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(7) Param
+Maximum depth of recursive cloning for self-recursive function.
+
+-param=ipa-cp-min-recursive-probability=
+Common Joined UInteger Var(param_ipa_cp_min_recursive_probability) Init(2) Param
+Recursive cloning only when the probability of call being executed exceeds the parameter.
+
 -param=ipa-cp-recursion-penalty=
 Common Joined UInteger Var(param_ipa_cp_recursion_penalty) Init(40) IntegerRange(0, 100) Param
 Percentage penalty the recursive functions will receive when they are evaluated for cloning.
diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
new file mode 100644
index 00000000000..d513020ee8b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining --param ipa-cp-max-recursive-depth=8" } */
+
+int fn();
+
+int data[100];
+
+int recur_fn (int i)
+{
+  int j;
+   
+  if (i == 6)
+    {
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      fn();
+      return 10;
+    }
+
+  data[i] = i; 
+
+  for (j = 0; j < 100; j++)
+    recur_fn (i + 1);
+
+  return i; 
+}
+
+int main ()
+{
+  int i;
+
+  for (i = 0; i < 100; i++)
+    recur_fn (1) + recur_fn (-5);
+
+  return 1;
+}
+
+/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */
-- 
2.17.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-27  2:07                     ` Feng Xue OS
@ 2019-11-27 15:12                       ` Jan Hubicka
  2019-11-28  3:48                         ` Feng Xue OS
  2019-12-01 23:20                       ` Jeff Law
  1 sibling, 1 reply; 19+ messages in thread
From: Jan Hubicka @ 2019-11-27 15:12 UTC (permalink / raw)
  To: Feng Xue OS; +Cc: Martin Jambor, Richard Biener, luoxhu, gcc-patches

> 2019-11-15  Feng Xue <fxue@os.amperecomputing.com>
> 
>         PR ipa/92133
>         * doc/invoke.texi (ipa-cp-max-recursive-depth): Document new option.
>         (ipa-cp-min-recursive-probability): Likewise.
>         * params.opt (ipa-cp-max-recursive-depth): New.
>         (ipa-cp-min-recursive-probability): Likewise.
>         * ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
>         val_p and unlimited.
>         (self_recursively_generated_p): New function.
>         (get_val_across_arith_op): Likewise.
>         (propagate_vals_across_arith_jfunc): Add constant propagation for
>         self-recursive function.
>         (incorporate_penalties): Do not penalize pure self-recursive function.
>         (good_cloning_opportunity_p): Dump node_is_self_scc flag.
>         (propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
>         (get_info_about_necessary_edges): Relax hotness check for edge to
>         self-recursive function.
>         * ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.

OK, thanks!
do you have some plans on the better cost model for the recursive
cloning? Also it would be nice to have this info available in recursive
inliner and give it a higher priority when inlining is going to turn
previously recrusive call into non-recursive.

Honza

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-27 15:12                       ` Jan Hubicka
@ 2019-11-28  3:48                         ` Feng Xue OS
  0 siblings, 0 replies; 19+ messages in thread
From: Feng Xue OS @ 2019-11-28  3:48 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: Martin Jambor, Richard Biener, luoxhu, gcc-patches

Have no a complete idea for new cost model, but there are something we can try:
o.  adjust estimated profile for recursive function so that we can get a higher threshold.
o.  introduce a strength level for ipa-cp-clone, by default, it is same as current configuration,
and with larger number, ipa-cp works more aggressively.
o. integrate frequency information to computation of prop_time_benefit.

Feng

________________________________________
From: Jan Hubicka <hubicka@ucw.cz>
Sent: Wednesday, November 27, 2019 10:27 PM
To: Feng Xue OS
Cc: Martin Jambor; Richard Biener; luoxhu; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)

> 2019-11-15  Feng Xue <fxue@os.amperecomputing.com>
>
>         PR ipa/92133
>         * doc/invoke.texi (ipa-cp-max-recursive-depth): Document new option.
>         (ipa-cp-min-recursive-probability): Likewise.
>         * params.opt (ipa-cp-max-recursive-depth): New.
>         (ipa-cp-min-recursive-probability): Likewise.
>         * ipa-cp.c (ipcp_lattice<valtype>::add_value): Add two new parameters
>         val_p and unlimited.
>         (self_recursively_generated_p): New function.
>         (get_val_across_arith_op): Likewise.
>         (propagate_vals_across_arith_jfunc): Add constant propagation for
>         self-recursive function.
>         (incorporate_penalties): Do not penalize pure self-recursive function.
>         (good_cloning_opportunity_p): Dump node_is_self_scc flag.
>         (propagate_constants_topo): Set node_is_self_scc flag for cgraph node.
>         (get_info_about_necessary_edges): Relax hotness check for edge to
>         self-recursive function.
>         * ipa-prop.h (ipa_node_params): Add new field node_is_self_scc.

OK, thanks!
do you have some plans on the better cost model for the recursive
cloning? Also it would be nice to have this info available in recursive
inliner and give it a higher priority when inlining is going to turn
previously recrusive call into non-recursive.

Honza

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-11-27  2:07                     ` Feng Xue OS
  2019-11-27 15:12                       ` Jan Hubicka
@ 2019-12-01 23:20                       ` Jeff Law
  2019-12-02  7:07                         ` Feng Xue OS
  1 sibling, 1 reply; 19+ messages in thread
From: Jeff Law @ 2019-12-01 23:20 UTC (permalink / raw)
  To: Feng Xue OS, Martin Jambor, Jan Hubicka, Richard Biener
  Cc: luoxhu, gcc-patches

On 11/26/19 6:44 PM, Feng Xue OS wrote:
> Hi, Richard,
> 
>   This patch is a not bugfix, while it is small. Martin and Honza are fine with it.
> But now we are in stage 3, is it ok to commit?
Yes.  It was posted well in advance of the stage1->stage3 change.
Please commit it ASAP so the testers can pick it up.

Thanks,
jeff

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)
  2019-12-01 23:20                       ` Jeff Law
@ 2019-12-02  7:07                         ` Feng Xue OS
  0 siblings, 0 replies; 19+ messages in thread
From: Feng Xue OS @ 2019-12-02  7:07 UTC (permalink / raw)
  To: Jeff Law, Martin Jambor, Jan Hubicka, Richard Biener; +Cc: luoxhu, gcc-patches

Done.   -Thanks

Feng


________________________________________
From: Jeff Law <law@redhat.com>
Sent: Monday, December 2, 2019 4:33 AM
To: Feng Xue OS; Martin Jambor; Jan Hubicka; Richard Biener
Cc: luoxhu; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)

On 11/26/19 6:44 PM, Feng Xue OS wrote:
> Hi, Richard,
>
>   This patch is a not bugfix, while it is small. Martin and Honza are fine with it.
> But now we are in stage 3, is it ok to commit?
Yes.  It was posted well in advance of the stage1->stage3 change.
Please commit it ASAP so the testers can pick it up.

Thanks,
jeff

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2019-12-02  7:07 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-17  8:35 [PATCH] Support multi-versioning on self-recursive function (ipa/92133) Feng Xue OS
2019-10-18  2:12 ` luoxhu
2019-10-18  5:15   ` Feng Xue OS
2019-10-24  6:17 ` luoxhu
2019-10-24  6:57   ` Feng Xue OS
2019-11-14 13:29     ` Jan Hubicka
2019-11-14 15:16       ` Feng Xue OS
2019-11-14 15:28         ` Jan Hubicka
2019-11-14 16:02           ` Feng Xue OS
2019-11-14 20:50             ` Jan Hubicka
2019-11-15 15:37               ` Feng Xue OS
2019-11-22  5:26                 ` Ping: " Feng Xue OS
2019-11-22 11:34                 ` Martin Jambor
2019-11-25 14:17                   ` Feng Xue OS
2019-11-27  2:07                     ` Feng Xue OS
2019-11-27 15:12                       ` Jan Hubicka
2019-11-28  3:48                         ` Feng Xue OS
2019-12-01 23:20                       ` Jeff Law
2019-12-02  7:07                         ` Feng Xue OS

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).