public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Time profiler - phase 2
@ 2013-11-14  1:12 Martin Liška
  2013-11-16 12:59 ` Jan Hubicka
  0 siblings, 1 reply; 18+ messages in thread
From: Martin Liška @ 2013-11-14  1:12 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 529 bytes --]

Hello,
   this patch introduces new function reordering feature that is based
on the patch I submitted few days ago.

The idea of a new flag (-fprofile-reorder-functions) is to reorder
functions according to the first execution in instrumented run. Of
course, the option operates only in LTO mode, where the compiler can
see all call graph nodes.

I've tested the patch on programs like Gimp, Inkscape, Firefox. There
are still some inaccuracies caused by weak symbols in C++ and has been
under investigation.

Thank you,
Martin

[-- Attachment #2: time-profiler-2.0.patch --]
[-- Type: text/x-patch, Size: 12052 bytes --]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c566a85..1562098 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,15 @@
+2013-11-13	Martin Liska	<marxin.liska@gmail.com>
+						Jan Hubicka  <jh@suse.cz>
+
+	* cgraphunit.c (node_cmp): New function.
+	(expand_all_functions): Function ordering added.
+	* common.opt: New profile based function reordering flag introduced.
+	* coverage.c (get_coverage_counts): Wrong profile handled.
+	* ipa.c (cgraph_externally_visible_p): New late flag introduced.
+	* lto-partition.c: Support for time profile added.
+	* lto.c: Likewise.
+	* value-prof.c: Histogram instrumentation switch added.
+
 2013-11-13  Vladimir Makarov  <vmakarov@redhat.com>
 
 	PR rtl-optimization/59036
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index 4765e6a..7cdd9a4 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -1821,6 +1821,17 @@ expand_function (struct cgraph_node *node)
   ipa_remove_all_references (&node->ref_list);
 }
 
+/* Node comparer that is responsible for the order that corresponds
+   to time when a function was launched for the first time.  */
+
+static int
+node_cmp (const void *pa, const void *pb)
+{
+  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
+  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  return b->tp_first_run - a->tp_first_run;
+}
 
 /* Expand all functions that must be output.
 
@@ -1832,11 +1843,14 @@ expand_function (struct cgraph_node *node)
    to use subsections to make the output functions appear in top-down
    order).  */
 
+
 static void
 expand_all_functions (void)
 {
   struct cgraph_node *node;
   struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes);
+
+  unsigned int expanded_func_count = 0, profiled_func_count = 0;
   int order_pos, new_order_pos = 0;
   int i;
 
@@ -1849,19 +1863,35 @@ expand_all_functions (void)
     if (order[i]->process)
       order[new_order_pos++] = order[i];
 
+  if (flag_profile_reorder_functions)
+    qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp);
+
   for (i = new_order_pos - 1; i >= 0; i--)
     {
       node = order[i];
+
       if (node->process)
 	{
+     expanded_func_count++;
+     if(node->tp_first_run)
+       profiled_func_count++;
+
 	  node->process = 0;
 	  expand_function (node);
 	}
     }
+
+    if (in_lto_p && dump_file)
+      fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
+               main_input_filename, profiled_func_count, expanded_func_count);
+
+  if (cgraph_dump_file && flag_profile_reorder_functions && in_lto_p)
+    fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n",
+             profiled_func_count, expanded_func_count);
+
   cgraph_process_new_functions ();
 
   free (order);
-
 }
 
 /* This is used to sort the node types by the cgraph order number.  */
@@ -2186,6 +2216,7 @@ compile (void)
 #endif
 
   cgraph_state = CGRAPH_STATE_EXPANSION;
+
   if (!flag_toplevel_reorder)
     output_in_order ();
   else
diff --git a/gcc/common.opt b/gcc/common.opt
index d5971df..85d5c74 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1708,6 +1708,10 @@ fprofile-report
 Common Report Var(profile_report)
 Report on consistency of profile
 
+fprofile-reorder-functions
+Common Report Var(flag_profile_reorder_functions)
+Enable function reordering that improves code placement
+
 frandom-seed
 Common Var(common_deferred_options) Defer
 
diff --git a/gcc/ipa-profile.c b/gcc/ipa-profile.c
index 1260069..eff4b51 100644
--- a/gcc/ipa-profile.c
+++ b/gcc/ipa-profile.c
@@ -465,6 +465,7 @@ ipa_propagate_frequency (struct cgraph_node *node)
   if (d.maybe_unlikely_executed)
     {
       node->frequency = NODE_FREQUENCY_UNLIKELY_EXECUTED;
+      node->tp_first_run = 0;
       if (dump_file)
 	fprintf (dump_file, "Node %s promoted to unlikely executed.\n",
 		 cgraph_node_name (node));
diff --git a/gcc/ipa.c b/gcc/ipa.c
index a11b1c7..d92a332 100644
--- a/gcc/ipa.c
+++ b/gcc/ipa.c
@@ -761,10 +761,14 @@ cgraph_externally_visible_p (struct cgraph_node *node,
      This improves code quality and we know we will duplicate them at most twice
      (in the case that we are not using plugin and link with object file
       implementing same COMDAT)  */
-  if ((in_lto_p || whole_program)
-      && DECL_COMDAT (node->decl)
-      && comdat_can_be_unshared_p (node))
-    return false;
+  if ((in_lto_p || whole_program || profile_arc_flag)
+     && DECL_COMDAT (node->decl)
+     && comdat_can_be_unshared_p (node))
+    {
+      gcc_checking_assert (cgraph_function_body_availability (node)
+			   > AVAIL_OVERWRITABLE);
+      return false;
+    }
 
   /* When doing link time optimizations, hidden symbols become local.  */
   if (in_lto_p
@@ -932,7 +936,7 @@ function_and_variable_visibility (bool whole_program)
 	}
       gcc_assert ((!DECL_WEAK (node->decl)
 		  && !DECL_COMDAT (node->decl))
-      	          || TREE_PUBLIC (node->decl)
+		  || TREE_PUBLIC (node->decl)
 		  || node->weakref
 		  || DECL_EXTERNAL (node->decl));
       if (cgraph_externally_visible_p (node, whole_program))
@@ -949,7 +953,7 @@ function_and_variable_visibility (bool whole_program)
 	  && node->definition && !node->weakref
 	  && !DECL_EXTERNAL (node->decl))
 	{
-	  gcc_assert (whole_program || in_lto_p
+	  gcc_assert (whole_program || in_lto_p || profile_arc_flag
 		      || !TREE_PUBLIC (node->decl));
 	  node->unique_name = ((node->resolution == LDPR_PREVAILING_DEF_IRONLY
 				      || node->resolution == LDPR_PREVAILING_DEF_IRONLY_EXP)
diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
index 6a3d881..e4ca6df 100644
--- a/gcc/lto/lto-partition.c
+++ b/gcc/lto/lto-partition.c
@@ -280,9 +280,11 @@ add_symbol_to_partition (ltrans_partition part, symtab_node *node)
 
      Be lax about comdats; they may or may not be duplicated and we may
      end up in need to duplicate keyed comdat because it has unkeyed alias.  */
+
   gcc_assert (get_symbol_class (node) == SYMBOL_DUPLICATE
 	      || DECL_COMDAT (node->decl)
 	      || !symbol_partitioned_p (node));
+
   add_symbol_to_partition_1 (part, node);
 }
 
@@ -395,6 +397,20 @@ node_cmp (const void *pa, const void *pb)
 {
   const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
   const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  /* Profile reorder flag enables function reordering based on first execution
+     of a function. All functions with profile are placed in ascending
+     order at the beginning.  */
+
+  if (flag_profile_reorder_functions)
+  {
+    if (a->tp_first_run && b->tp_first_run)
+      return a->tp_first_run - b->tp_first_run;
+
+    if (a->tp_first_run || b->tp_first_run)
+      return b->tp_first_run - a->tp_first_run;
+  }
+
   return b->order - a->order;
 }
 
@@ -449,7 +465,7 @@ void
 lto_balanced_map (void)
 {
   int n_nodes = 0;
-  int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
+  int n_varpool_nodes = 0, varpool_pos = 0;
   struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid);
   struct varpool_node **varpool_order = NULL;
   int i;
@@ -481,10 +497,13 @@ lto_balanced_map (void)
      get better about minimizing the function bounday, but until that
      things works smoother if we order in source order.  */
   qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
+
+  if (cgraph_dump_file)
+    for(i = 0; i < n_nodes; i++)
+      fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", cgraph_node_asm_name (order[i]), order[i]->tp_first_run);
+
   if (!flag_toplevel_reorder)
     {
-      qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
-
       FOR_EACH_VARIABLE (vnode)
 	if (get_symbol_class (vnode) == SYMBOL_PARTITION)
 	  n_varpool_nodes++;
@@ -683,7 +702,6 @@ lto_balanced_map (void)
 	  best_i = i;
 	  best_n_nodes = lto_symtab_encoder_size (partition->encoder);
 	  best_total_size = total_size;
-	  best_varpool_pos = varpool_pos;
 	}
       if (cgraph_dump_file)
 	fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
@@ -701,7 +719,6 @@ lto_balanced_map (void)
 		fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
 			 i - best_i, best_i);
 	      undo_partition (partition, best_n_nodes);
-	      varpool_pos = best_varpool_pos;
 	    }
 	  i = best_i;
  	  /* When we are finished, avoid creating empty partition.  */
@@ -849,7 +866,7 @@ may_need_named_section_p (lto_symtab_encoder_t encoder, symtab_node *node)
    of the same name in partition ENCODER (or in whole compilation unit if
    ENCODER is NULL) and if so, mangle the statics.  Always mangle all
    conflicting statics, so we reduce changes of silently miscompiling
-   asm statemnets referring to them by symbol name.  */
+   asm statements referring to them by symbol name.  */
 
 static void
 rename_statics (lto_symtab_encoder_t encoder, symtab_node *node)
diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
index 62856d0..2401c76 100644
--- a/gcc/lto/lto.c
+++ b/gcc/lto/lto.c
@@ -2453,9 +2453,12 @@ lto_wpa_write_files (void)
   /* Sort partitions by size so small ones are compiled last.
      FIXME: Even when not reordering we may want to output one list for parallel make
      and other for final link command.  */
-  ltrans_partitions.qsort (flag_toplevel_reorder
+
+  if (!flag_profile_reorder_functions || !flag_profile_use)
+    ltrans_partitions.qsort (flag_toplevel_reorder
 			   ? cmp_partitions_size
 			   : cmp_partitions_order);
+
   for (i = 0; i < n_sets; i++)
     {
       size_t len;
diff --git a/gcc/opts.c b/gcc/opts.c
index 3a939ac..d3a7735 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -1690,6 +1690,8 @@ common_handle_option (struct gcc_options *opts,
 	opts->x_flag_vect_cost_model = VECT_COST_MODEL_DYNAMIC;
       if (!opts_set->x_flag_tree_loop_distribute_patterns)
 	opts->x_flag_tree_loop_distribute_patterns = value;
+      if (!opts_set->x_flag_profile_reorder_functions)
+	opts->x_flag_profile_reorder_functions = value;
       /* Indirect call profiling should do all useful transformations
  	 speculative devirutalization does.  */
       if (!opts_set->x_flag_devirtualize_speculatively
@@ -1708,6 +1710,8 @@ common_handle_option (struct gcc_options *opts,
 	opts->x_flag_profile_values = value;
       if (!opts_set->x_flag_inline_functions)
 	opts->x_flag_inline_functions = value;
+      if (!opts_set->x_flag_profile_reorder_functions)
+	opts->x_flag_profile_reorder_functions = value;
       /* FIXME: Instrumentation we insert makes ipa-reference bitmaps
 	 quadratic.  Disable the pass until better memory representation
 	 is done.  */
diff --git a/gcc/varasm.c b/gcc/varasm.c
index 2226912..758367f 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -548,7 +548,14 @@ default_function_section (tree decl, enum node_frequency freq,
      unlikely executed (this happens especially with function splitting
      where we can split away unnecessary parts of static constructors.  */
   if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
-    return get_named_text_section (decl, ".text.startup", NULL);
+    {
+      /* If we do have a profile or(and) LTO phase is executed, we do not need
+         these ELF section.  */
+      if (!in_lto_p || !flag_profile_values)
+        return get_named_text_section (decl, ".text.startup", NULL);
+      else
+        return NULL;
+    }
 
   /* Similarly for exit.  */
   if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
@@ -560,7 +567,10 @@ default_function_section (tree decl, enum node_frequency freq,
       case NODE_FREQUENCY_UNLIKELY_EXECUTED:
 	return get_named_text_section (decl, ".text.unlikely", NULL);
       case NODE_FREQUENCY_HOT:
-	return get_named_text_section (decl, ".text.hot", NULL);
+        /* If we do have a profile or(and) LTO phase is executed, we do not need
+           these ELF section.  */
+        if (!in_lto_p || !flag_profile_values)
+          return get_named_text_section (decl, ".text.hot", NULL);
       default:
 	return NULL;
     }

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-11-14  1:12 [PATCH] Time profiler - phase 2 Martin Liška
@ 2013-11-16 12:59 ` Jan Hubicka
  2013-11-17 23:38   ` Martin Liška
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Hubicka @ 2013-11-16 12:59 UTC (permalink / raw)
  To: Martin Liška; +Cc: gcc-patches

> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index c566a85..1562098 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,15 @@
> +2013-11-13	Martin Liska	<marxin.liska@gmail.com>
> +						Jan Hubicka  <jh@suse.cz>
> +
> +	* cgraphunit.c (node_cmp): New function.
> +	(expand_all_functions): Function ordering added.
> +	* common.opt: New profile based function reordering flag introduced.
> +	* coverage.c (get_coverage_counts): Wrong profile handled.
> +	* ipa.c (cgraph_externally_visible_p): New late flag introduced.
> +	* lto-partition.c: Support for time profile added.
> +	* lto.c: Likewise.
> +	* value-prof.c: Histogram instrumentation switch added.
> +
>  2013-11-13  Vladimir Makarov  <vmakarov@redhat.com>
>  
>  	PR rtl-optimization/59036
> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
> index 4765e6a..7cdd9a4 100644
> --- a/gcc/cgraphunit.c
> +++ b/gcc/cgraphunit.c
> @@ -1821,6 +1821,17 @@ expand_function (struct cgraph_node *node)
>    ipa_remove_all_references (&node->ref_list);
>  }
>  
> +/* Node comparer that is responsible for the order that corresponds
> +   to time when a function was launched for the first time.  */
> +
> +static int
> +node_cmp (const void *pa, const void *pb)
> +{
> +  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
> +  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
> +
> +  return b->tp_first_run - a->tp_first_run;

Please stabilize this by using node->order when tp_first_run is equivalent.
Later we ought to use better heuristic here, but order may be good enough to
start with.
> diff --git a/gcc/ipa.c b/gcc/ipa.c
> index a11b1c7..d92a332 100644
> --- a/gcc/ipa.c
> +++ b/gcc/ipa.c
> @@ -761,10 +761,14 @@ cgraph_externally_visible_p (struct cgraph_node *node,
>       This improves code quality and we know we will duplicate them at most twice
>       (in the case that we are not using plugin and link with object file
>        implementing same COMDAT)  */
> -  if ((in_lto_p || whole_program)
> -      && DECL_COMDAT (node->decl)
> -      && comdat_can_be_unshared_p (node))
> -    return false;
> +  if ((in_lto_p || whole_program || profile_arc_flag)
> +     && DECL_COMDAT (node->decl)
> +     && comdat_can_be_unshared_p (node))
> +    {
> +      gcc_checking_assert (cgraph_function_body_availability (node)
> +			   > AVAIL_OVERWRITABLE);
> +      return false;
> +    }
>  
>    /* When doing link time optimizations, hidden symbols become local.  */
>    if (in_lto_p
> @@ -932,7 +936,7 @@ function_and_variable_visibility (bool whole_program)
>  	}
>        gcc_assert ((!DECL_WEAK (node->decl)
>  		  && !DECL_COMDAT (node->decl))
> -      	          || TREE_PUBLIC (node->decl)
> +		  || TREE_PUBLIC (node->decl)
>  		  || node->weakref
>  		  || DECL_EXTERNAL (node->decl));
>        if (cgraph_externally_visible_p (node, whole_program))
> @@ -949,7 +953,7 @@ function_and_variable_visibility (bool whole_program)
>  	  && node->definition && !node->weakref
>  	  && !DECL_EXTERNAL (node->decl))
>  	{
> -	  gcc_assert (whole_program || in_lto_p
> +	  gcc_assert (whole_program || in_lto_p || profile_arc_flag
>  		      || !TREE_PUBLIC (node->decl));
>  	  node->unique_name = ((node->resolution == LDPR_PREVAILING_DEF_IRONLY
>  				      || node->resolution == LDPR_PREVAILING_DEF_IRONLY_EXP)

These changes are unrelated, please remove them.
> @@ -395,6 +397,20 @@ node_cmp (const void *pa, const void *pb)
>  {
>    const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
>    const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
> +
> +  /* Profile reorder flag enables function reordering based on first execution
> +     of a function. All functions with profile are placed in ascending
> +     order at the beginning.  */
> +
> +  if (flag_profile_reorder_functions)
				       && a->tp_first_run != b->tp_first_run
> +  {
> +    if (a->tp_first_run && b->tp_first_run)
> +      return a->tp_first_run - b->tp_first_run;
> +
> +    if (a->tp_first_run || b->tp_first_run)
> +      return b->tp_first_run - a->tp_first_run;

Drop a comment explaining the logic here ;)
> @@ -449,7 +465,7 @@ void
>  lto_balanced_map (void)
>  {
>    int n_nodes = 0;
> -  int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
> +  int n_varpool_nodes = 0, varpool_pos = 0;
>    struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid);
>    struct varpool_node **varpool_order = NULL;
>    int i;
> @@ -481,10 +497,13 @@ lto_balanced_map (void)
>       get better about minimizing the function bounday, but until that
>       things works smoother if we order in source order.  */
>    qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
> +
> +  if (cgraph_dump_file)
> +    for(i = 0; i < n_nodes; i++)
> +      fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", cgraph_node_asm_name (order[i]), order[i]->tp_first_run);
> +
>    if (!flag_toplevel_reorder)
>      {
> -      qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
> -
>        FOR_EACH_VARIABLE (vnode)
>  	if (get_symbol_class (vnode) == SYMBOL_PARTITION)
>  	  n_varpool_nodes++;

Good catch (the redundant qsort)

> @@ -683,7 +702,6 @@ lto_balanced_map (void)
>  	  best_i = i;
>  	  best_n_nodes = lto_symtab_encoder_size (partition->encoder);
>  	  best_total_size = total_size;
> -	  best_varpool_pos = varpool_pos;
>  	}
>        if (cgraph_dump_file)
>  	fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
> @@ -701,7 +719,6 @@ lto_balanced_map (void)
>  		fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
>  			 i - best_i, best_i);
>  	      undo_partition (partition, best_n_nodes);
> -	      varpool_pos = best_varpool_pos;
>  	    }
>  	  i = best_i;
>   	  /* When we are finished, avoid creating empty partition.  */

This actually reverts a bugfix, so please remove the two changes above.
> diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
> index 62856d0..2401c76 100644
> --- a/gcc/lto/lto.c
> +++ b/gcc/lto/lto.c
> @@ -2453,9 +2453,12 @@ lto_wpa_write_files (void)
>    /* Sort partitions by size so small ones are compiled last.
>       FIXME: Even when not reordering we may want to output one list for parallel make
>       and other for final link command.  */
> -  ltrans_partitions.qsort (flag_toplevel_reorder
> +
> +  if (!flag_profile_reorder_functions || !flag_profile_use)
> +    ltrans_partitions.qsort (flag_toplevel_reorder
>  			   ? cmp_partitions_size
>  			   : cmp_partitions_order);
> +

Add a FIXME explaining that we ought to produce two list - one for Make parallelizm and one for final link time.
lets work on this incrementally.
> diff --git a/gcc/varasm.c b/gcc/varasm.c
> index 2226912..758367f 100644
> --- a/gcc/varasm.c
> +++ b/gcc/varasm.c
> @@ -548,7 +548,14 @@ default_function_section (tree decl, enum node_frequency freq,
>       unlikely executed (this happens especially with function splitting
>       where we can split away unnecessary parts of static constructors.  */
>    if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
> -    return get_named_text_section (decl, ".text.startup", NULL);
> +    {
> +      /* If we do have a profile or(and) LTO phase is executed, we do not need
> +         these ELF section.  */
> +      if (!in_lto_p || !flag_profile_values)
> +        return get_named_text_section (decl, ".text.startup", NULL);
> +      else
> +        return NULL;
> +    }
>  
>    /* Similarly for exit.  */
>    if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
> @@ -560,7 +567,10 @@ default_function_section (tree decl, enum node_frequency freq,
>        case NODE_FREQUENCY_UNLIKELY_EXECUTED:
>  	return get_named_text_section (decl, ".text.unlikely", NULL);
>        case NODE_FREQUENCY_HOT:
> -	return get_named_text_section (decl, ".text.hot", NULL);
> +        /* If we do have a profile or(and) LTO phase is executed, we do not need
> +           these ELF section.  */
> +        if (!in_lto_p || !flag_profile_values)
> +          return get_named_text_section (decl, ".text.hot", NULL);
>        default:
>  	return NULL;
>      }
Lets handle this in separate patch.

The patch is missing documentation in doc/invoke.texi.  You need to document
the new command line option and update documentation of -fprofile-use.  Please
add it and send an update patch.

Also Teresa recently sent patch for resetting profiles of functions where we clearly missed the profile.
Please add there a code filling in tp_first_run from largest first_run of the callers with non-0 count.

Thanks,
Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-11-16 12:59 ` Jan Hubicka
@ 2013-11-17 23:38   ` Martin Liška
  2013-11-18  4:37     ` Jan Hubicka
  0 siblings, 1 reply; 18+ messages in thread
From: Martin Liška @ 2013-11-17 23:38 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 9452 bytes --]

Dear Jan,

On 16 November 2013 12:24, Jan Hubicka <hubicka@ucw.cz> wrote:
>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> index c566a85..1562098 100644
>> --- a/gcc/ChangeLog
>> +++ b/gcc/ChangeLog
>> @@ -1,3 +1,15 @@
>> +2013-11-13   Martin Liska    <marxin.liska@gmail.com>
>> +                                             Jan Hubicka  <jh@suse.cz>
>> +
>> +     * cgraphunit.c (node_cmp): New function.
>> +     (expand_all_functions): Function ordering added.
>> +     * common.opt: New profile based function reordering flag introduced.
>> +     * coverage.c (get_coverage_counts): Wrong profile handled.
>> +     * ipa.c (cgraph_externally_visible_p): New late flag introduced.
>> +     * lto-partition.c: Support for time profile added.
>> +     * lto.c: Likewise.
>> +     * value-prof.c: Histogram instrumentation switch added.
>> +
>>  2013-11-13  Vladimir Makarov  <vmakarov@redhat.com>
>>
>>       PR rtl-optimization/59036
>> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
>> index 4765e6a..7cdd9a4 100644
>> --- a/gcc/cgraphunit.c
>> +++ b/gcc/cgraphunit.c
>> @@ -1821,6 +1821,17 @@ expand_function (struct cgraph_node *node)
>>    ipa_remove_all_references (&node->ref_list);
>>  }
>>
>> +/* Node comparer that is responsible for the order that corresponds
>> +   to time when a function was launched for the first time.  */
>> +
>> +static int
>> +node_cmp (const void *pa, const void *pb)
>> +{
>> +  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
>> +  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
>> +
>> +  return b->tp_first_run - a->tp_first_run;
>
> Please stabilize this by using node->order when tp_first_run is equivalent.
> Later we ought to use better heuristic here, but order may be good enough to

Done.

> start with.
>> diff --git a/gcc/ipa.c b/gcc/ipa.c
>> index a11b1c7..d92a332 100644
>> --- a/gcc/ipa.c
>> +++ b/gcc/ipa.c
>> @@ -761,10 +761,14 @@ cgraph_externally_visible_p (struct cgraph_node *node,
>>       This improves code quality and we know we will duplicate them at most twice
>>       (in the case that we are not using plugin and link with object file
>>        implementing same COMDAT)  */
>> -  if ((in_lto_p || whole_program)
>> -      && DECL_COMDAT (node->decl)
>> -      && comdat_can_be_unshared_p (node))
>> -    return false;
>> +  if ((in_lto_p || whole_program || profile_arc_flag)
>> +     && DECL_COMDAT (node->decl)
>> +     && comdat_can_be_unshared_p (node))
>> +    {
>> +      gcc_checking_assert (cgraph_function_body_availability (node)
>> +                        > AVAIL_OVERWRITABLE);
>> +      return false;
>> +    }
>>
>>    /* When doing link time optimizations, hidden symbols become local.  */
>>    if (in_lto_p
>> @@ -932,7 +936,7 @@ function_and_variable_visibility (bool whole_program)
>>       }
>>        gcc_assert ((!DECL_WEAK (node->decl)
>>                 && !DECL_COMDAT (node->decl))
>> -                       || TREE_PUBLIC (node->decl)
>> +               || TREE_PUBLIC (node->decl)
>>                 || node->weakref
>>                 || DECL_EXTERNAL (node->decl));
>>        if (cgraph_externally_visible_p (node, whole_program))
>> @@ -949,7 +953,7 @@ function_and_variable_visibility (bool whole_program)
>>         && node->definition && !node->weakref
>>         && !DECL_EXTERNAL (node->decl))
>>       {
>> -       gcc_assert (whole_program || in_lto_p
>> +       gcc_assert (whole_program || in_lto_p || profile_arc_flag
>>                     || !TREE_PUBLIC (node->decl));
>>         node->unique_name = ((node->resolution == LDPR_PREVAILING_DEF_IRONLY
>>                                     || node->resolution == LDPR_PREVAILING_DEF_IRONLY_EXP)
>
> These changes are unrelated, please remove them.
>> @@ -395,6 +397,20 @@ node_cmp (const void *pa, const void *pb)
>>  {
>>    const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
>>    const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
>> +
>> +  /* Profile reorder flag enables function reordering based on first execution
>> +     of a function. All functions with profile are placed in ascending
>> +     order at the beginning.  */
>> +
>> +  if (flag_profile_reorder_functions)
>                                        && a->tp_first_run != b->tp_first_run
>> +  {
>> +    if (a->tp_first_run && b->tp_first_run)
>> +      return a->tp_first_run - b->tp_first_run;
>> +
>> +    if (a->tp_first_run || b->tp_first_run)
>> +      return b->tp_first_run - a->tp_first_run;
>
> Drop a comment explaining the logic here ;)
>> @@ -449,7 +465,7 @@ void
>>  lto_balanced_map (void)
>>  {
>>    int n_nodes = 0;
>> -  int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
>> +  int n_varpool_nodes = 0, varpool_pos = 0;
>>    struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid);
>>    struct varpool_node **varpool_order = NULL;
>>    int i;
>> @@ -481,10 +497,13 @@ lto_balanced_map (void)
>>       get better about minimizing the function bounday, but until that
>>       things works smoother if we order in source order.  */
>>    qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
>> +
>> +  if (cgraph_dump_file)
>> +    for(i = 0; i < n_nodes; i++)
>> +      fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", cgraph_node_asm_name (order[i]), order[i]->tp_first_run);
>> +
>>    if (!flag_toplevel_reorder)
>>      {
>> -      qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
>> -
>>        FOR_EACH_VARIABLE (vnode)
>>       if (get_symbol_class (vnode) == SYMBOL_PARTITION)
>>         n_varpool_nodes++;
>
> Good catch (the redundant qsort)
>
>> @@ -683,7 +702,6 @@ lto_balanced_map (void)
>>         best_i = i;
>>         best_n_nodes = lto_symtab_encoder_size (partition->encoder);
>>         best_total_size = total_size;
>> -       best_varpool_pos = varpool_pos;
>>       }
>>        if (cgraph_dump_file)
>>       fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
>> @@ -701,7 +719,6 @@ lto_balanced_map (void)
>>               fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
>>                        i - best_i, best_i);
>>             undo_partition (partition, best_n_nodes);
>> -           varpool_pos = best_varpool_pos;
>>           }
>>         i = best_i;
>>         /* When we are finished, avoid creating empty partition.  */
>
> This actually reverts a bugfix, so please remove the two changes above.
>> diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
>> index 62856d0..2401c76 100644
>> --- a/gcc/lto/lto.c
>> +++ b/gcc/lto/lto.c
>> @@ -2453,9 +2453,12 @@ lto_wpa_write_files (void)
>>    /* Sort partitions by size so small ones are compiled last.
>>       FIXME: Even when not reordering we may want to output one list for parallel make
>>       and other for final link command.  */
>> -  ltrans_partitions.qsort (flag_toplevel_reorder
>> +
>> +  if (!flag_profile_reorder_functions || !flag_profile_use)
>> +    ltrans_partitions.qsort (flag_toplevel_reorder
>>                          ? cmp_partitions_size
>>                          : cmp_partitions_order);
>> +
>
> Add a FIXME explaining that we ought to produce two list - one for Make parallelizm and one for final link time.
> lets work on this incrementally.
>> diff --git a/gcc/varasm.c b/gcc/varasm.c
>> index 2226912..758367f 100644
>> --- a/gcc/varasm.c
>> +++ b/gcc/varasm.c
>> @@ -548,7 +548,14 @@ default_function_section (tree decl, enum node_frequency freq,
>>       unlikely executed (this happens especially with function splitting
>>       where we can split away unnecessary parts of static constructors.  */
>>    if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
>> -    return get_named_text_section (decl, ".text.startup", NULL);
>> +    {
>> +      /* If we do have a profile or(and) LTO phase is executed, we do not need
>> +         these ELF section.  */
>> +      if (!in_lto_p || !flag_profile_values)
>> +        return get_named_text_section (decl, ".text.startup", NULL);
>> +      else
>> +        return NULL;
>> +    }
>>
>>    /* Similarly for exit.  */
>>    if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
>> @@ -560,7 +567,10 @@ default_function_section (tree decl, enum node_frequency freq,
>>        case NODE_FREQUENCY_UNLIKELY_EXECUTED:
>>       return get_named_text_section (decl, ".text.unlikely", NULL);
>>        case NODE_FREQUENCY_HOT:
>> -     return get_named_text_section (decl, ".text.hot", NULL);
>> +        /* If we do have a profile or(and) LTO phase is executed, we do not need
>> +           these ELF section.  */
>> +        if (!in_lto_p || !flag_profile_values)
>> +          return get_named_text_section (decl, ".text.hot", NULL);
>>        default:
>>       return NULL;
>>      }
> Lets handle this in separate patch.
>
> The patch is missing documentation in doc/invoke.texi.  You need to document
> the new command line option and update documentation of -fprofile-use.  Please
> add it and send an update patch.

I added documentation to invoke.texi.

> Also Teresa recently sent patch for resetting profiles of functions where we clearly missed the profile.
> Please add there a code filling in tp_first_run from largest first_run of the callers with non-0 count.

Good idea, I implemented filling of time-profile according to callers.

New version of patch is submitted as attachment.

Martin

>
> Thanks,
> Honza

[-- Attachment #2: time-profiler-2.1.patch --]
[-- Type: text/x-patch, Size: 11529 bytes --]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 5cb07b7..754f882 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,13 @@
+2013-11-17  Martin Liska  <marxin.liska@gmail.com>
+	    Jan Hubicka  <jh@suse.cz>
+
+	* cgraphunit.c (node_cmp): New function.
+	(expand_all_functions): Function ordering added.
+	* common.opt: New profile based function reordering flag introduced.
+	* lto-partition.c: Support for time profile added.
+	* lto.c: Likewise.
+	* predict.c (handle_missing_profiles): Time profile handled in
+	  missing profiles.
 2013-11-16  Joern Rennecke  <joern.rennecke@embecosm.com>
 
 	* config/arc/arc.c (arc_predicate_delay_insns): New function.
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index 8ab274b..ea722b8 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -1824,6 +1824,19 @@ expand_function (struct cgraph_node *node)
   ipa_remove_all_references (&node->ref_list);
 }
 
+/* Node comparer that is responsible for the order that corresponds
+   to time when a function was launched for the first time.  */
+
+static int
+node_cmp (const void *pa, const void *pb)
+{
+  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
+  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  return a->tp_first_run != b->tp_first_run
+	 ? b->tp_first_run - a->tp_first_run
+	 : b->order - a->order;
+}
 
 /* Expand all functions that must be output.
 
@@ -1835,11 +1848,14 @@ expand_function (struct cgraph_node *node)
    to use subsections to make the output functions appear in top-down
    order).  */
 
+
 static void
 expand_all_functions (void)
 {
   struct cgraph_node *node;
   struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes);
+
+  unsigned int expanded_func_count = 0, profiled_func_count = 0;
   int order_pos, new_order_pos = 0;
   int i;
 
@@ -1852,19 +1868,35 @@ expand_all_functions (void)
     if (order[i]->process)
       order[new_order_pos++] = order[i];
 
+  if (flag_profile_reorder_functions)
+    qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp);
+
   for (i = new_order_pos - 1; i >= 0; i--)
     {
       node = order[i];
+
       if (node->process)
 	{
+     expanded_func_count++;
+     if(node->tp_first_run)
+       profiled_func_count++;
+
 	  node->process = 0;
 	  expand_function (node);
 	}
     }
+
+    if (in_lto_p && dump_file)
+      fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
+               main_input_filename, profiled_func_count, expanded_func_count);
+
+  if (cgraph_dump_file && flag_profile_reorder_functions && in_lto_p)
+    fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n",
+             profiled_func_count, expanded_func_count);
+
   cgraph_process_new_functions ();
 
   free (order);
-
 }
 
 /* This is used to sort the node types by the cgraph order number.  */
@@ -2189,6 +2221,7 @@ compile (void)
 #endif
 
   cgraph_state = CGRAPH_STATE_EXPANSION;
+
   if (!flag_toplevel_reorder)
     output_in_order ();
   else
diff --git a/gcc/common.opt b/gcc/common.opt
index d5971df..85d5c74 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1708,6 +1708,10 @@ fprofile-report
 Common Report Var(profile_report)
 Report on consistency of profile
 
+fprofile-reorder-functions
+Common Report Var(flag_profile_reorder_functions)
+Enable function reordering that improves code placement
+
 frandom-seed
 Common Var(common_deferred_options) Defer
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index ff4c2ee..eb101ed 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -393,7 +393,7 @@ Objective-C and Objective-C++ Dialects}.
 -fprefetch-loop-arrays -fprofile-report @gol
 -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
 -fprofile-generate=@var{path} @gol
--fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
+-fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol
 -freciprocal-math -free -frename-registers -freorder-blocks @gol
 -freorder-blocks-and-partition -freorder-functions @gol
 -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
@@ -8645,7 +8645,7 @@ profile useful for later recompilation with profile feedback based
 optimization.  You must use @option{-fprofile-generate} both when
 compiling and when linking your program.
 
-The following options are enabled: @code{-fprofile-arcs}, @code{-fprofile-values}, @code{-fvpt}.
+The following options are enabled: @code{-fprofile-arcs}, @code{-fprofile-values}, @code{-fprofile-reorder-functions}, @code{-fvpt}.
 
 If @var{path} is specified, GCC looks at the @var{path} to find
 the profile feedback data files. See @option{-fprofile-dir}.
@@ -8933,6 +8933,14 @@ from profiling values of expressions for usage in optimizations.
 
 Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
 
+@item -fprofile-reoder-functions
+@opindex fprofile-reorder-functions
+Function reordering based on profile instrumentation collects
+first time of execution of a function and orders these functions
+in ascending order.
+
+Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
+
 @item -fvpt
 @opindex fvpt
 If combined with @option{-fprofile-arcs}, this option instructs the compiler
diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
index 6a3d881..42e4aef 100644
--- a/gcc/lto/lto-partition.c
+++ b/gcc/lto/lto-partition.c
@@ -280,9 +280,11 @@ add_symbol_to_partition (ltrans_partition part, symtab_node *node)
 
      Be lax about comdats; they may or may not be duplicated and we may
      end up in need to duplicate keyed comdat because it has unkeyed alias.  */
+
   gcc_assert (get_symbol_class (node) == SYMBOL_DUPLICATE
 	      || DECL_COMDAT (node->decl)
 	      || !symbol_partitioned_p (node));
+
   add_symbol_to_partition_1 (part, node);
 }
 
@@ -395,6 +397,25 @@ node_cmp (const void *pa, const void *pb)
 {
   const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
   const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  /* Profile reorder flag enables function reordering based on first execution
+     of a function. All functions with profile are placed in ascending
+     order at the beginning.  */
+
+  if (flag_profile_reorder_functions)
+  {
+    /* Functions with time profile are sorted in ascending order.  */
+    if (a->tp_first_run && b->tp_first_run)
+      return a->tp_first_run != b->tp_first_run
+	? a->tp_first_run - b->tp_first_run
+        : a->order - b->order;
+
+    /* Functions with time profile are sorted before the functions
+       that do not have the profile.  */
+    if (a->tp_first_run || b->tp_first_run)
+      return b->tp_first_run - a->tp_first_run;
+  }
+
   return b->order - a->order;
 }
 
@@ -449,7 +470,7 @@ void
 lto_balanced_map (void)
 {
   int n_nodes = 0;
-  int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
+  int n_varpool_nodes = 0, varpool_pos = 0;
   struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid);
   struct varpool_node **varpool_order = NULL;
   int i;
@@ -481,10 +502,13 @@ lto_balanced_map (void)
      get better about minimizing the function bounday, but until that
      things works smoother if we order in source order.  */
   qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
+
+  if (cgraph_dump_file)
+    for(i = 0; i < n_nodes; i++)
+      fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", cgraph_node_asm_name (order[i]), order[i]->tp_first_run);
+
   if (!flag_toplevel_reorder)
     {
-      qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
-
       FOR_EACH_VARIABLE (vnode)
 	if (get_symbol_class (vnode) == SYMBOL_PARTITION)
 	  n_varpool_nodes++;
@@ -683,7 +707,6 @@ lto_balanced_map (void)
 	  best_i = i;
 	  best_n_nodes = lto_symtab_encoder_size (partition->encoder);
 	  best_total_size = total_size;
-	  best_varpool_pos = varpool_pos;
 	}
       if (cgraph_dump_file)
 	fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
@@ -701,7 +724,6 @@ lto_balanced_map (void)
 		fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
 			 i - best_i, best_i);
 	      undo_partition (partition, best_n_nodes);
-	      varpool_pos = best_varpool_pos;
 	    }
 	  i = best_i;
  	  /* When we are finished, avoid creating empty partition.  */
@@ -849,7 +871,7 @@ may_need_named_section_p (lto_symtab_encoder_t encoder, symtab_node *node)
    of the same name in partition ENCODER (or in whole compilation unit if
    ENCODER is NULL) and if so, mangle the statics.  Always mangle all
    conflicting statics, so we reduce changes of silently miscompiling
-   asm statemnets referring to them by symbol name.  */
+   asm statements referring to them by symbol name.  */
 
 static void
 rename_statics (lto_symtab_encoder_t encoder, symtab_node *node)
diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
index 62856d0..2401c76 100644
--- a/gcc/lto/lto.c
+++ b/gcc/lto/lto.c
@@ -2453,9 +2453,12 @@ lto_wpa_write_files (void)
   /* Sort partitions by size so small ones are compiled last.
      FIXME: Even when not reordering we may want to output one list for parallel make
      and other for final link command.  */
-  ltrans_partitions.qsort (flag_toplevel_reorder
+
+  if (!flag_profile_reorder_functions || !flag_profile_use)
+    ltrans_partitions.qsort (flag_toplevel_reorder
 			   ? cmp_partitions_size
 			   : cmp_partitions_order);
+
   for (i = 0; i < n_sets; i++)
     {
       size_t len;
diff --git a/gcc/opts.c b/gcc/opts.c
index 3a939ac..d3a7735 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -1690,6 +1690,8 @@ common_handle_option (struct gcc_options *opts,
 	opts->x_flag_vect_cost_model = VECT_COST_MODEL_DYNAMIC;
       if (!opts_set->x_flag_tree_loop_distribute_patterns)
 	opts->x_flag_tree_loop_distribute_patterns = value;
+      if (!opts_set->x_flag_profile_reorder_functions)
+	opts->x_flag_profile_reorder_functions = value;
       /* Indirect call profiling should do all useful transformations
  	 speculative devirutalization does.  */
       if (!opts_set->x_flag_devirtualize_speculatively
@@ -1708,6 +1710,8 @@ common_handle_option (struct gcc_options *opts,
 	opts->x_flag_profile_values = value;
       if (!opts_set->x_flag_inline_functions)
 	opts->x_flag_inline_functions = value;
+      if (!opts_set->x_flag_profile_reorder_functions)
+	opts->x_flag_profile_reorder_functions = value;
       /* FIXME: Instrumentation we insert makes ipa-reference bitmaps
 	 quadratic.  Disable the pass until better memory representation
 	 is done.  */
diff --git a/gcc/predict.c b/gcc/predict.c
index 61cac52..dbfcfdd 100644
--- a/gcc/predict.c
+++ b/gcc/predict.c
@@ -2840,12 +2840,24 @@ handle_missing_profiles (void)
     {
       struct cgraph_edge *e;
       gcov_type call_count = 0;
+      gcov_type max_tp_first_run = 0;
       struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
 
       if (node->count)
         continue;
       for (e = node->callers; e; e = e->next_caller)
+      {
         call_count += e->count;
+
+	if (e->caller->tp_first_run > max_tp_first_run)
+	  max_tp_first_run = e->caller->tp_first_run;
+      }
+
+      /* If time profile is missing, let assign the maximum that comes from
+	 caller functions.  */
+      if (!node->tp_first_run)
+	node->tp_first_run = max_tp_first_run;
+
       if (call_count
           && fn && fn->cfg
           && (call_count * unlikely_count_fraction >= profile_info->runs))

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-11-17 23:38   ` Martin Liška
@ 2013-11-18  4:37     ` Jan Hubicka
       [not found]       ` <CAObPJ3MqUhdybQ+Ua08ab680aoqXSAoq+tAfXi_xxkyYXzM2tg@mail.gmail.com>
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Hubicka @ 2013-11-18  4:37 UTC (permalink / raw)
  To: Martin Liška; +Cc: Jan Hubicka, gcc-patches

> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 5cb07b7..754f882 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,13 @@
> +2013-11-17  Martin Liska  <marxin.liska@gmail.com>
> +	    Jan Hubicka  <jh@suse.cz>
> +
> +	* cgraphunit.c (node_cmp): New function.
> +	(expand_all_functions): Function ordering added.
> +	* common.opt: New profile based function reordering flag introduced.
> +	* lto-partition.c: Support for time profile added.
> +	* lto.c: Likewise.
> +	* predict.c (handle_missing_profiles): Time profile handled in
> +	  missing profiles.

OK.
> @@ -8933,6 +8933,14 @@ from profiling values of expressions for usage in optimizations.
>  
>  Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
>  
> +@item -fprofile-reoder-functions
> +@opindex fprofile-reorder-functions
> +Function reordering based on profile instrumentation collects
> +first time of execution of a function and orders these functions
> +in ascending order.
> +
> +Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.

I wonder if we don't want to enable it only for -fprofile-use -flto.
You do not need to enable it -fprofile-generate.
> --- a/gcc/opts.c
> +++ b/gcc/opts.c
> @@ -1690,6 +1690,8 @@ common_handle_option (struct gcc_options *opts,
>  	opts->x_flag_vect_cost_model = VECT_COST_MODEL_DYNAMIC;
>        if (!opts_set->x_flag_tree_loop_distribute_patterns)
>  	opts->x_flag_tree_loop_distribute_patterns = value;
> +      if (!opts_set->x_flag_profile_reorder_functions)
> +	opts->x_flag_profile_reorder_functions = value;
>        /* Indirect call profiling should do all useful transformations
>   	 speculative devirutalization does.  */
>        if (!opts_set->x_flag_devirtualize_speculatively
> @@ -1708,6 +1710,8 @@ common_handle_option (struct gcc_options *opts,
>  	opts->x_flag_profile_values = value;
>        if (!opts_set->x_flag_inline_functions)
>  	opts->x_flag_inline_functions = value;
> +      if (!opts_set->x_flag_profile_reorder_functions)
> +	opts->x_flag_profile_reorder_functions = value;
>        /* FIXME: Instrumentation we insert makes ipa-reference bitmaps
>  	 quadratic.  Disable the pass until better memory representation
>  	 is done.  */

Rmove the -fprofile-generate path here.
> +
> +      /* If time profile is missing, let assign the maximum that comes from
> +	 caller functions.  */
> +      if (!node->tp_first_run)
> +	node->tp_first_run = max_tp_first_run;

			Probably +1 here, you want the function to appar afterwards.

Honza
> +
>        if (call_count
>            && fn && fn->cfg
>            && (call_count * unlikely_count_fraction >= profile_info->runs))

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
       [not found]       ` <CAObPJ3MqUhdybQ+Ua08ab680aoqXSAoq+tAfXi_xxkyYXzM2tg@mail.gmail.com>
@ 2013-11-18  7:43         ` Jan Hubicka
  2013-11-18  9:21           ` Martin Liška
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Hubicka @ 2013-11-18  7:43 UTC (permalink / raw)
  To: Martin Liška; +Cc: Jan Hubicka, gcc-patches

> Hello,
>    there is a new version of the patch, I disabled the branch with
> profile-generate. Could you please advise me how should I force to use
> profile-reorder-functions just with enable LTO optimization?
> 
> I also attach reordering results:
> o gimp-reoder-latest.html (latest patch)
> o gimp-reoder-without-fix.html (without the code in gcc/predict.c)
> 
> Thank you,
> Martin
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 5cb07b7..754f882 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,13 @@
> +2013-11-17  Martin Liska  <marxin.liska@gmail.com>
> +	    Jan Hubicka  <jh@suse.cz>
> +
> +	* cgraphunit.c (node_cmp): New function.
> +	(expand_all_functions): Function ordering added.
> +	* common.opt: New profile based function reordering flag introduced.
> +	* lto-partition.c: Support for time profile added.
> +	* lto.c: Likewise.
> +	* predict.c (handle_missing_profiles): Time profile handled in
> +	  missing profiles.

OK,
thanks!
> @@ -8645,7 +8645,7 @@ profile useful for later recompilation with profile feedback based
>  optimization.  You must use @option{-fprofile-generate} both when
>  compiling and when linking your program.
>  
> -The following options are enabled: @code{-fprofile-arcs}, @code{-fprofile-values}, @code{-fvpt}.
> +The following options are enabled: @code{-fprofile-arcs}, @code{-fprofile-values}, @code{-fprofile-reorder-functions}, @code{-fvpt}.
>  
>  If @var{path} is specified, GCC looks at the @var{path} to find
>  the profile feedback data files. See @option{-fprofile-dir}.

Skip this change, it is only about the profiling options used.  I think it is enough to mention it later.
> @@ -8933,6 +8933,14 @@ from profiling values of expressions for usage in optimizations.
>  
>  Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
>  
> +@item -fprofile-reoder-functions
> +@opindex fprofile-reorder-functions
> +Function reordering based on profile instrumentation collects
> +first time of execution of a function and orders these functions
> +in ascending order.
> +
> +Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.

Only with -fprofile-use.

What happened with the plans for linker support? Perhaps we can implement the numbered sections by Carry's proposal and hope that binutils will catch up in next release?

Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-11-18  7:43         ` Jan Hubicka
@ 2013-11-18  9:21           ` Martin Liška
  2013-11-18 10:56             ` Jan Hubicka
  0 siblings, 1 reply; 18+ messages in thread
From: Martin Liška @ 2013-11-18  9:21 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2626 bytes --]

Hello,
   there's new version of the patch. I wrote email to Cary to
negotiate how will implement gold's linker patch.

Thanks,
Martin

On 18 November 2013 00:37, Jan Hubicka <hubicka@ucw.cz> wrote:
>> Hello,
>>    there is a new version of the patch, I disabled the branch with
>> profile-generate. Could you please advise me how should I force to use
>> profile-reorder-functions just with enable LTO optimization?
>>
>> I also attach reordering results:
>> o gimp-reoder-latest.html (latest patch)
>> o gimp-reoder-without-fix.html (without the code in gcc/predict.c)
>>
>> Thank you,
>> Martin
>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> index 5cb07b7..754f882 100644
>> --- a/gcc/ChangeLog
>> +++ b/gcc/ChangeLog
>> @@ -1,3 +1,13 @@
>> +2013-11-17  Martin Liska  <marxin.liska@gmail.com>
>> +         Jan Hubicka  <jh@suse.cz>
>> +
>> +     * cgraphunit.c (node_cmp): New function.
>> +     (expand_all_functions): Function ordering added.
>> +     * common.opt: New profile based function reordering flag introduced.
>> +     * lto-partition.c: Support for time profile added.
>> +     * lto.c: Likewise.
>> +     * predict.c (handle_missing_profiles): Time profile handled in
>> +       missing profiles.
>
> OK,
> thanks!
>> @@ -8645,7 +8645,7 @@ profile useful for later recompilation with profile feedback based
>>  optimization.  You must use @option{-fprofile-generate} both when
>>  compiling and when linking your program.
>>
>> -The following options are enabled: @code{-fprofile-arcs}, @code{-fprofile-values}, @code{-fvpt}.
>> +The following options are enabled: @code{-fprofile-arcs}, @code{-fprofile-values}, @code{-fprofile-reorder-functions}, @code{-fvpt}.
>>
>>  If @var{path} is specified, GCC looks at the @var{path} to find
>>  the profile feedback data files. See @option{-fprofile-dir}.
>
> Skip this change, it is only about the profiling options used.  I think it is enough to mention it later.
>> @@ -8933,6 +8933,14 @@ from profiling values of expressions for usage in optimizations.
>>
>>  Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
>>
>> +@item -fprofile-reoder-functions
>> +@opindex fprofile-reorder-functions
>> +Function reordering based on profile instrumentation collects
>> +first time of execution of a function and orders these functions
>> +in ascending order.
>> +
>> +Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
>
> Only with -fprofile-use.
>
> What happened with the plans for linker support? Perhaps we can implement the numbered sections by Carry's proposal and hope that binutils will catch up in next release?
>
> Honza

[-- Attachment #2: time-profiler-2.3.patch --]
[-- Type: text/x-patch, Size: 10503 bytes --]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 5cb07b7..754f882 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,13 @@
+2013-11-17  Martin Liska  <marxin.liska@gmail.com>
+	    Jan Hubicka  <jh@suse.cz>
+
+	* cgraphunit.c (node_cmp): New function.
+	(expand_all_functions): Function ordering added.
+	* common.opt: New profile based function reordering flag introduced.
+	* lto-partition.c: Support for time profile added.
+	* lto.c: Likewise.
+	* predict.c (handle_missing_profiles): Time profile handled in
+	  missing profiles.
 2013-11-16  Joern Rennecke  <joern.rennecke@embecosm.com>
 
 	* config/arc/arc.c (arc_predicate_delay_insns): New function.
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index 8ab274b..ea722b8 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -1824,6 +1824,19 @@ expand_function (struct cgraph_node *node)
   ipa_remove_all_references (&node->ref_list);
 }
 
+/* Node comparer that is responsible for the order that corresponds
+   to time when a function was launched for the first time.  */
+
+static int
+node_cmp (const void *pa, const void *pb)
+{
+  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
+  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  return a->tp_first_run != b->tp_first_run
+	 ? b->tp_first_run - a->tp_first_run
+	 : b->order - a->order;
+}
 
 /* Expand all functions that must be output.
 
@@ -1835,11 +1848,14 @@ expand_function (struct cgraph_node *node)
    to use subsections to make the output functions appear in top-down
    order).  */
 
+
 static void
 expand_all_functions (void)
 {
   struct cgraph_node *node;
   struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes);
+
+  unsigned int expanded_func_count = 0, profiled_func_count = 0;
   int order_pos, new_order_pos = 0;
   int i;
 
@@ -1852,19 +1868,35 @@ expand_all_functions (void)
     if (order[i]->process)
       order[new_order_pos++] = order[i];
 
+  if (flag_profile_reorder_functions)
+    qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp);
+
   for (i = new_order_pos - 1; i >= 0; i--)
     {
       node = order[i];
+
       if (node->process)
 	{
+     expanded_func_count++;
+     if(node->tp_first_run)
+       profiled_func_count++;
+
 	  node->process = 0;
 	  expand_function (node);
 	}
     }
+
+    if (in_lto_p && dump_file)
+      fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
+               main_input_filename, profiled_func_count, expanded_func_count);
+
+  if (cgraph_dump_file && flag_profile_reorder_functions && in_lto_p)
+    fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n",
+             profiled_func_count, expanded_func_count);
+
   cgraph_process_new_functions ();
 
   free (order);
-
 }
 
 /* This is used to sort the node types by the cgraph order number.  */
@@ -2189,6 +2221,7 @@ compile (void)
 #endif
 
   cgraph_state = CGRAPH_STATE_EXPANSION;
+
   if (!flag_toplevel_reorder)
     output_in_order ();
   else
diff --git a/gcc/common.opt b/gcc/common.opt
index d5971df..85d5c74 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1708,6 +1708,10 @@ fprofile-report
 Common Report Var(profile_report)
 Report on consistency of profile
 
+fprofile-reorder-functions
+Common Report Var(flag_profile_reorder_functions)
+Enable function reordering that improves code placement
+
 frandom-seed
 Common Var(common_deferred_options) Defer
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index ff4c2ee..e4972ab 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -393,7 +393,7 @@ Objective-C and Objective-C++ Dialects}.
 -fprefetch-loop-arrays -fprofile-report @gol
 -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
 -fprofile-generate=@var{path} @gol
--fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
+-fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol
 -freciprocal-math -free -frename-registers -freorder-blocks @gol
 -freorder-blocks-and-partition -freorder-functions @gol
 -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
@@ -8933,6 +8933,14 @@ from profiling values of expressions for usage in optimizations.
 
 Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
 
+@item -fprofile-reoder-functions
+@opindex fprofile-reorder-functions
+Function reordering based on profile instrumentation collects
+first time of execution of a function and orders these functions
+in ascending order.
+
+Enabled with @option{-fprofile-use}.
+
 @item -fvpt
 @opindex fvpt
 If combined with @option{-fprofile-arcs}, this option instructs the compiler
diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
index 6a3d881..42e4aef 100644
--- a/gcc/lto/lto-partition.c
+++ b/gcc/lto/lto-partition.c
@@ -280,9 +280,11 @@ add_symbol_to_partition (ltrans_partition part, symtab_node *node)
 
      Be lax about comdats; they may or may not be duplicated and we may
      end up in need to duplicate keyed comdat because it has unkeyed alias.  */
+
   gcc_assert (get_symbol_class (node) == SYMBOL_DUPLICATE
 	      || DECL_COMDAT (node->decl)
 	      || !symbol_partitioned_p (node));
+
   add_symbol_to_partition_1 (part, node);
 }
 
@@ -395,6 +397,25 @@ node_cmp (const void *pa, const void *pb)
 {
   const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
   const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  /* Profile reorder flag enables function reordering based on first execution
+     of a function. All functions with profile are placed in ascending
+     order at the beginning.  */
+
+  if (flag_profile_reorder_functions)
+  {
+    /* Functions with time profile are sorted in ascending order.  */
+    if (a->tp_first_run && b->tp_first_run)
+      return a->tp_first_run != b->tp_first_run
+	? a->tp_first_run - b->tp_first_run
+        : a->order - b->order;
+
+    /* Functions with time profile are sorted before the functions
+       that do not have the profile.  */
+    if (a->tp_first_run || b->tp_first_run)
+      return b->tp_first_run - a->tp_first_run;
+  }
+
   return b->order - a->order;
 }
 
@@ -449,7 +470,7 @@ void
 lto_balanced_map (void)
 {
   int n_nodes = 0;
-  int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
+  int n_varpool_nodes = 0, varpool_pos = 0;
   struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid);
   struct varpool_node **varpool_order = NULL;
   int i;
@@ -481,10 +502,13 @@ lto_balanced_map (void)
      get better about minimizing the function bounday, but until that
      things works smoother if we order in source order.  */
   qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
+
+  if (cgraph_dump_file)
+    for(i = 0; i < n_nodes; i++)
+      fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", cgraph_node_asm_name (order[i]), order[i]->tp_first_run);
+
   if (!flag_toplevel_reorder)
     {
-      qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
-
       FOR_EACH_VARIABLE (vnode)
 	if (get_symbol_class (vnode) == SYMBOL_PARTITION)
 	  n_varpool_nodes++;
@@ -683,7 +707,6 @@ lto_balanced_map (void)
 	  best_i = i;
 	  best_n_nodes = lto_symtab_encoder_size (partition->encoder);
 	  best_total_size = total_size;
-	  best_varpool_pos = varpool_pos;
 	}
       if (cgraph_dump_file)
 	fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
@@ -701,7 +724,6 @@ lto_balanced_map (void)
 		fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
 			 i - best_i, best_i);
 	      undo_partition (partition, best_n_nodes);
-	      varpool_pos = best_varpool_pos;
 	    }
 	  i = best_i;
  	  /* When we are finished, avoid creating empty partition.  */
@@ -849,7 +871,7 @@ may_need_named_section_p (lto_symtab_encoder_t encoder, symtab_node *node)
    of the same name in partition ENCODER (or in whole compilation unit if
    ENCODER is NULL) and if so, mangle the statics.  Always mangle all
    conflicting statics, so we reduce changes of silently miscompiling
-   asm statemnets referring to them by symbol name.  */
+   asm statements referring to them by symbol name.  */
 
 static void
 rename_statics (lto_symtab_encoder_t encoder, symtab_node *node)
diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
index 62856d0..2401c76 100644
--- a/gcc/lto/lto.c
+++ b/gcc/lto/lto.c
@@ -2453,9 +2453,12 @@ lto_wpa_write_files (void)
   /* Sort partitions by size so small ones are compiled last.
      FIXME: Even when not reordering we may want to output one list for parallel make
      and other for final link command.  */
-  ltrans_partitions.qsort (flag_toplevel_reorder
+
+  if (!flag_profile_reorder_functions || !flag_profile_use)
+    ltrans_partitions.qsort (flag_toplevel_reorder
 			   ? cmp_partitions_size
 			   : cmp_partitions_order);
+
   for (i = 0; i < n_sets; i++)
     {
       size_t len;
diff --git a/gcc/opts.c b/gcc/opts.c
index 3a939ac..b842543 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -1690,6 +1690,8 @@ common_handle_option (struct gcc_options *opts,
 	opts->x_flag_vect_cost_model = VECT_COST_MODEL_DYNAMIC;
       if (!opts_set->x_flag_tree_loop_distribute_patterns)
 	opts->x_flag_tree_loop_distribute_patterns = value;
+      if (!opts_set->x_flag_profile_reorder_functions)
+	opts->x_flag_profile_reorder_functions = value;
       /* Indirect call profiling should do all useful transformations
  	 speculative devirutalization does.  */
       if (!opts_set->x_flag_devirtualize_speculatively
diff --git a/gcc/predict.c b/gcc/predict.c
index 61cac52..39a0ef7 100644
--- a/gcc/predict.c
+++ b/gcc/predict.c
@@ -2840,12 +2840,24 @@ handle_missing_profiles (void)
     {
       struct cgraph_edge *e;
       gcov_type call_count = 0;
+      gcov_type max_tp_first_run = 0;
       struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
 
       if (node->count)
         continue;
       for (e = node->callers; e; e = e->next_caller)
+      {
         call_count += e->count;
+
+	if (e->caller->tp_first_run > max_tp_first_run)
+	  max_tp_first_run = e->caller->tp_first_run;
+      }
+
+      /* If time profile is missing, let assign the maximum that comes from
+	 caller functions.  */
+      if (!node->tp_first_run && max_tp_first_run)
+	node->tp_first_run = max_tp_first_run + 1;
+
       if (call_count
           && fn && fn->cfg
           && (call_count * unlikely_count_fraction >= profile_info->runs))

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-11-18  9:21           ` Martin Liška
@ 2013-11-18 10:56             ` Jan Hubicka
       [not found]               ` <CAObPJ3M7maO8FmJHMpKFv-sqr=ch=gUWRDKdBzyunmAYDfzpHw@mail.gmail.com>
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Hubicka @ 2013-11-18 10:56 UTC (permalink / raw)
  To: Martin Liška; +Cc: Jan Hubicka, gcc-patches

> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 5cb07b7..754f882 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,13 @@
> +2013-11-17  Martin Liska  <marxin.liska@gmail.com>
> +	    Jan Hubicka  <jh@suse.cz>
> +
> +	* cgraphunit.c (node_cmp): New function.
> +	(expand_all_functions): Function ordering added.
> +	* common.opt: New profile based function reordering flag introduced.
> +	* lto-partition.c: Support for time profile added.
> +	* lto.c: Likewise.
> +	* predict.c (handle_missing_profiles): Time profile handled in
> +	  missing profiles.

OK,
thanks!  Implementing the function section naming scheme would be easy and it would
enable us to do the reordering even w/o LTO that would be quite cool. Lets hope it gets
resolved soon.

Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
       [not found]               ` <CAObPJ3M7maO8FmJHMpKFv-sqr=ch=gUWRDKdBzyunmAYDfzpHw@mail.gmail.com>
@ 2013-12-02 23:37                 ` Martin Liška
       [not found]                 ` <CAObPJ3OKRZhb8UYEtvq3RPt3Xw_QzGEpx_vFjFGibtiLXZWSVQ@mail.gmail.com>
  1 sibling, 0 replies; 18+ messages in thread
From: Martin Liška @ 2013-12-02 23:37 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: gcc-patches

Hello,
   there are dumps for Inkscape, it looks very well.

Link: https://docs.google.com/file/d/0B0pisUJ80pO1Y0t1aEVBRlByR28/edit

There are few of functions that look like this (wpa cgraph):

_ZL13resync_activeP19_EgeSelectOneActionii/2604322 (resync_active)
@0x7f84af42cea0
  Type: function definition analyzed
  Visibility: prevailing_def_ironly
  References:
  Referring:
  Read from file: libinkscape.a
  Availability: local
  First run: 4422
  Function flags: executed 47x local
  Called by: ege_select_one_action_set_active_text/2604300 (0.34 per
call) (can throw external)
_ZL21commit_pending_changeP19_EgeSelectOneAction/2604327 (0.16 per
call) (can throw external)
_ZL34ege_select_one_action_set_propertyP8_GObjectjPK7_GValueP11_GParamSpec/2604316
(47x) (0.16 per call) (can throw external)
  Calls: _ZL13resync_activeP19_EgeSelectOneActionii.part.0/2604456
(10x) (0.21 per call) (can throw external)

_ZL13resync_activeP19_EgeSelectOneActionii.part.0/2604456
(_ZL13resync_activeP19_EgeSelectOneActionii.part.0) @0x7f84af42cd68
  Type: function definition analyzed
  Visibility: artificial
  References: _ZL7signals/2604291 (read)
  Referring:
  Read from file: libinkscape.a
  Availability: local
  First run: 0
  Function flags: executed 10x local
  Called by: _ZL13resync_activeP19_EgeSelectOneActionii/2604322 (10x)
(0.21 per call) (can throw external)

First function has a profile (position is correct according to
valgrind) and second not. Both of them comes from the same object
file. The problem is that the second one is called according to
valgrind. What does .part.X means, is it a part of function that was
separated to a different function? Is there any was these two profiles
could be merged?

Thank you,
Martin

On 27 November 2013 00:24, Martin Liška <marxin.liska@gmail.com> wrote:
> Hello,
>     present results reached for GIMP by the reordering pass. Important
> to notice, that I just used single '.text' section where all symbols
> are placed. As you can see, there just few functions that are not
> catched by the pass (3 of them are LTO clones, that I will find out).
> And 2 functions were not seen during -fprofile-generate run.
>
> In following days, I will prepare same dumps for Inkscape and Firefox.
>
> Martin
>
> On 18 November 2013 11:16, Jan Hubicka <hubicka@ucw.cz> wrote:
>>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>>> index 5cb07b7..754f882 100644
>>> --- a/gcc/ChangeLog
>>> +++ b/gcc/ChangeLog
>>> @@ -1,3 +1,13 @@
>>> +2013-11-17  Martin Liska  <marxin.liska@gmail.com>
>>> +         Jan Hubicka  <jh@suse.cz>
>>> +
>>> +     * cgraphunit.c (node_cmp): New function.
>>> +     (expand_all_functions): Function ordering added.
>>> +     * common.opt: New profile based function reordering flag introduced.
>>> +     * lto-partition.c: Support for time profile added.
>>> +     * lto.c: Likewise.
>>> +     * predict.c (handle_missing_profiles): Time profile handled in
>>> +       missing profiles.
>>
>> OK,
>> thanks!  Implementing the function section naming scheme would be easy and it would
>> enable us to do the reordering even w/o LTO that would be quite cool. Lets hope it gets
>> resolved soon.
>>
>> Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
       [not found]                 ` <CAObPJ3OKRZhb8UYEtvq3RPt3Xw_QzGEpx_vFjFGibtiLXZWSVQ@mail.gmail.com>
@ 2013-12-05 13:35                   ` Jan Hubicka
  2013-12-05 13:38                     ` Jan Hubicka
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Hubicka @ 2013-12-05 13:35 UTC (permalink / raw)
  To: Martin Liška; +Cc: Jan Hubicka, gcc-patches

> Hello,
>    there are dumps for Inkscape, it looks very well. There are few of
> functions that look like this (wpa cgraph):
> 
> _ZL13resync_activeP19_EgeSelectOneActionii/2604322 (resync_active)
> @0x7f84af42cea0
>   Type: function definition analyzed
>   Visibility: prevailing_def_ironly
>   References:
>   Referring:
>   Read from file: libinkscape.a
>   Availability: local
>   First run: 4422
>   Function flags: executed 47x local
>   Called by: ege_select_one_action_set_active_text/2604300 (0.34 per
> call) (can throw external)
> _ZL21commit_pending_changeP19_EgeSelectOneAction/2604327 (0.16 per
> call) (can throw external)
> _ZL34ege_select_one_action_set_propertyP8_GObjectjPK7_GValueP11_GParamSpec/2604316
> (47x) (0.16 per call) (can throw external)
>   Calls: _ZL13resync_activeP19_EgeSelectOneActionii.part.0/2604456
> (10x) (0.21 per call) (can throw external)
> 
> _ZL13resync_activeP19_EgeSelectOneActionii.part.0/2604456
> (_ZL13resync_activeP19_EgeSelectOneActionii.part.0) @0x7f84af42cd68
>   Type: function definition analyzed
>   Visibility: artificial
>   References: _ZL7signals/2604291 (read)
>   Referring:
>   Read from file: libinkscape.a
>   Availability: local
>   First run: 0
>   Function flags: executed 10x local
>   Called by: _ZL13resync_activeP19_EgeSelectOneActionii/2604322 (10x)
> (0.21 per call) (can throw external)
> 
> First function has a profile (position is correct according to
> valgrind) and second not. Both of them comes from the same object
> file. The problem is that the second one is called according to
> valgrind. What does .part.X means, is it a part of function that was
> separated to a different function? Is there any was these two profiles
> could be merged?

.part.X are functions created by function splitting.  I assume we want to modify
ipa-split.c as follows:
Index: ipa-split.c
===================================================================
--- ipa-split.c (revision 205531)
+++ ipa-split.c (working copy)
@@ -1233,6 +1233,7 @@
                                     !split_part_return_p,
                                     split_point->split_bbs,
                                     split_point->entry_bb, "part");
+  node->tp_first_run = cur_node->tp_first_run + 1;
   /* For usual cloning it is enough to clear builtin only when signature
      changes.  For partial inlining we however can not expect the part
      of builtin implementation to have same semantic as the whole.  */

Can you, please, send me the -flto systemtaps for gimp and/or inkscape so we can decide
on the patch? We should enable it earlier in stage3 rather than later.

Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-12-05 13:35                   ` Jan Hubicka
@ 2013-12-05 13:38                     ` Jan Hubicka
  2013-12-05 19:38                       ` Martin Liška
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Hubicka @ 2013-12-05 13:38 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: Martin Liška, gcc-patches

> Can you, please, send me the -flto systemtaps for gimp and/or inkscape so we can decide
> on the patch? We should enable it earlier in stage3 rather than later.

I see, the PDF was included within the tar file.  Was this with
-freorder-blocks-and-partition?  If so, the patch is OK.
I still think we should put cold parts of hot/normal function into a subsection different
from unlikely section, but lets handle that incrementally.

Thanks,
Honza
> 
> Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-12-05 13:38                     ` Jan Hubicka
@ 2013-12-05 19:38                       ` Martin Liška
  2013-12-05 21:32                         ` Jan Hubicka
  0 siblings, 1 reply; 18+ messages in thread
From: Martin Liška @ 2013-12-05 19:38 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: gcc-patches

Hello,
   thank you for the trick in ipa-split.c. It really helped! I
prepared 2 tests for Inkscape, first was just with my function
reordering pass. And for the second, I enable also
-freorder-blocks-and-partition (note: files ending with _blocks.xxx in
attached tar).

Touched pages:
just reorder functions: 1108
with -freorder-blocks-and-partition: 1120

Please see dumps at:
https://drive.google.com/file/d/0B0pisUJ80pO1R19zSXR6U1Q4NmM/edit?usp=sharing
Note: I put all function to a single section (for easier layout orientation).

If I'm correct, there a small chunk of about 10 functions after the
boundary of untouched functions and a single miss at the end of
binary: __libc_csu_init.
If you look at the graph line, it looks really well.

I will prepare patch for mailing list as a phase 2.

Martin

On 5 December 2013 14:38, Jan Hubicka <hubicka@ucw.cz> wrote:
>> Can you, please, send me the -flto systemtaps for gimp and/or inkscape so we can decide
>> on the patch? We should enable it earlier in stage3 rather than later.
>
> I see, the PDF was included within the tar file.  Was this with
> -freorder-blocks-and-partition?  If so, the patch is OK.
> I still think we should put cold parts of hot/normal function into a subsection different
> from unlikely section, but lets handle that incrementally.
>
> Thanks,
> Honza
>>
>> Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-12-05 19:38                       ` Martin Liška
@ 2013-12-05 21:32                         ` Jan Hubicka
  2013-12-15 23:08                           ` Martin Liška
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Hubicka @ 2013-12-05 21:32 UTC (permalink / raw)
  To: Martin Liška; +Cc: Jan Hubicka, gcc-patches

> Hello,
>    thank you for the trick in ipa-split.c. It really helped! I

Good!, this patch is pre-approved after testing.
> prepared 2 tests for Inkscape, first was just with my function
> reordering pass. And for the second, I enable also
> -freorder-blocks-and-partition (note: files ending with _blocks.xxx in
> attached tar).
> 
> Touched pages:
> just reorder functions: 1108
> with -freorder-blocks-and-partition: 1120
> 
> Please see dumps at:
> https://drive.google.com/file/d/0B0pisUJ80pO1R19zSXR6U1Q4NmM/edit?usp=sharing
> Note: I put all function to a single section (for easier layout orientation).

I think for -freorder-blocks-and-partition you will need to split the sections
again, otherwise we will not see the effect of the pass. Can you, please, make
one extra systemtap with this (it would be good to have both
-fno-reorder-blocks-and-partition and -freorder-blocks-and-partition so we can
compare size of bloth)?
But overall it looks good!

Honza
> 
> If I'm correct, there a small chunk of about 10 functions after the
> boundary of untouched functions and a single miss at the end of
> binary: __libc_csu_init.
> If you look at the graph line, it looks really well.
> 
> I will prepare patch for mailing list as a phase 2.
> 
> Martin
> 
> On 5 December 2013 14:38, Jan Hubicka <hubicka@ucw.cz> wrote:
> >> Can you, please, send me the -flto systemtaps for gimp and/or inkscape so we can decide
> >> on the patch? We should enable it earlier in stage3 rather than later.
> >
> > I see, the PDF was included within the tar file.  Was this with
> > -freorder-blocks-and-partition?  If so, the patch is OK.
> > I still think we should put cold parts of hot/normal function into a subsection different
> > from unlikely section, but lets handle that incrementally.
> >
> > Thanks,
> > Honza
> >>
> >> Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-12-05 21:32                         ` Jan Hubicka
@ 2013-12-15 23:08                           ` Martin Liška
  2013-12-15 23:21                             ` Jan Hubicka
  0 siblings, 1 reply; 18+ messages in thread
From: Martin Liška @ 2013-12-15 23:08 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2022 bytes --]

There's latest version of the patch.
Could you please approve the patch?

Martin

On 5 December 2013 22:32, Jan Hubicka <hubicka@ucw.cz> wrote:
>> Hello,
>>    thank you for the trick in ipa-split.c. It really helped! I
>
> Good!, this patch is pre-approved after testing.
>> prepared 2 tests for Inkscape, first was just with my function
>> reordering pass. And for the second, I enable also
>> -freorder-blocks-and-partition (note: files ending with _blocks.xxx in
>> attached tar).
>>
>> Touched pages:
>> just reorder functions: 1108
>> with -freorder-blocks-and-partition: 1120
>>
>> Please see dumps at:
>> https://drive.google.com/file/d/0B0pisUJ80pO1R19zSXR6U1Q4NmM/edit?usp=sharing
>> Note: I put all function to a single section (for easier layout orientation).
>
> I think for -freorder-blocks-and-partition you will need to split the sections
> again, otherwise we will not see the effect of the pass. Can you, please, make
> one extra systemtap with this (it would be good to have both
> -fno-reorder-blocks-and-partition and -freorder-blocks-and-partition so we can
> compare size of bloth)?
> But overall it looks good!
>
> Honza
>>
>> If I'm correct, there a small chunk of about 10 functions after the
>> boundary of untouched functions and a single miss at the end of
>> binary: __libc_csu_init.
>> If you look at the graph line, it looks really well.
>>
>> I will prepare patch for mailing list as a phase 2.
>>
>> Martin
>>
>> On 5 December 2013 14:38, Jan Hubicka <hubicka@ucw.cz> wrote:
>> >> Can you, please, send me the -flto systemtaps for gimp and/or inkscape so we can decide
>> >> on the patch? We should enable it earlier in stage3 rather than later.
>> >
>> > I see, the PDF was included within the tar file.  Was this with
>> > -freorder-blocks-and-partition?  If so, the patch is OK.
>> > I still think we should put cold parts of hot/normal function into a subsection different
>> > from unlikely section, but lets handle that incrementally.
>> >
>> > Thanks,
>> > Honza
>> >>
>> >> Honza

[-- Attachment #2: time-profiler-2.4.patch --]
[-- Type: text/x-patch, Size: 12783 bytes --]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 93e857df..d5a0ac8 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,14 @@
+2013-12-15  Martin Liska  <marxin.liska@gmail.com>
+	    Jan Hubicka  <jh@suse.cz>
+
+	* cgraphunit.c (node_cmp): New function.
+	(expand_all_functions): Function ordering added.
+	* common.opt: New profile based function reordering flag introduced.
+	* lto-partition.c: Support for time profile added.
+	* lto.c: Likewise.
+	* predict.c (handle_missing_profiles): Time profile handled in
+	  missing profiles.
+
 2013-12-14   Jan Hubicka  <jh@suse.cz>
 
 	PR middle-end/58477
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index 44f3afd..2b66331 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -1831,6 +1831,23 @@ expand_function (struct cgraph_node *node)
   ipa_remove_all_references (&node->ref_list);
 }
 
+/* Node comparer that is responsible for the order that corresponds
+   to time when a function was launched for the first time.  */
+
+static int
+node_cmp (const void *pa, const void *pb)
+{
+  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
+  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  /* Functions with time profile must be before these without profile.  */
+  if (!a->tp_first_run || !b->tp_first_run)
+    return a->tp_first_run - b->tp_first_run;
+
+  return a->tp_first_run != b->tp_first_run
+	 ? b->tp_first_run - a->tp_first_run
+	 : b->order - a->order;
+}
 
 /* Expand all functions that must be output.
 
@@ -1842,11 +1859,14 @@ expand_function (struct cgraph_node *node)
    to use subsections to make the output functions appear in top-down
    order).  */
 
+
 static void
 expand_all_functions (void)
 {
   struct cgraph_node *node;
   struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes);
+
+  unsigned int expanded_func_count = 0, profiled_func_count = 0;
   int order_pos, new_order_pos = 0;
   int i;
 
@@ -1859,20 +1879,39 @@ expand_all_functions (void)
     if (order[i]->process)
       order[new_order_pos++] = order[i];
 
+  if (flag_profile_reorder_functions)
+    qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp);
+
   for (i = new_order_pos - 1; i >= 0; i--)
     {
       node = order[i];
+
       if (node->process)
 	{
+     expanded_func_count++;
+     if(node->tp_first_run)
+       profiled_func_count++;
+
+    if (cgraph_dump_file)
+      fprintf (cgraph_dump_file, "Time profile order in expand_all_functions:%s:%d\n", node->asm_name (), node->tp_first_run);
+
 	  node->process = 0;
 	  expand_function (node);
 	}
     }
+
+    if (in_lto_p && dump_file)
+      fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
+               main_input_filename, profiled_func_count, expanded_func_count);
+
+  if (cgraph_dump_file && flag_profile_reorder_functions && in_lto_p)
+    fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n",
+             profiled_func_count, expanded_func_count);
+
   cgraph_process_new_functions ();
   free_gimplify_stack ();
 
   free (order);
-
 }
 
 /* This is used to sort the node types by the cgraph order number.  */
@@ -2194,6 +2233,7 @@ compile (void)
 #endif
 
   cgraph_state = CGRAPH_STATE_EXPANSION;
+
   if (!flag_toplevel_reorder)
     output_in_order ();
   else
diff --git a/gcc/common.opt b/gcc/common.opt
index 0cd1fdd..ea323fd 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1712,6 +1712,10 @@ fprofile-report
 Common Report Var(profile_report)
 Report on consistency of profile
 
+fprofile-reorder-functions
+Common Report Var(flag_profile_reorder_functions)
+Enable function reordering that improves code placement
+
 frandom-seed
 Common Var(common_deferred_options) Defer
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b655a64..b30a368 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -394,7 +394,7 @@ Objective-C and Objective-C++ Dialects}.
 -fprefetch-loop-arrays -fprofile-report @gol
 -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
 -fprofile-generate=@var{path} @gol
--fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
+-fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol
 -freciprocal-math -free -frename-registers -freorder-blocks @gol
 -freorder-blocks-and-partition -freorder-functions @gol
 -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
@@ -9071,6 +9071,14 @@ from profiling values of expressions for usage in optimizations.
 
 Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
 
+@item -fprofile-reoder-functions
+@opindex fprofile-reorder-functions
+Function reordering based on profile instrumentation collects
+first time of execution of a function and orders these functions
+in ascending order.
+
+Enabled with @option{-fprofile-use}.
+
 @item -fvpt
 @opindex fvpt
 If combined with @option{-fprofile-arcs}, this option instructs the compiler
diff --git a/gcc/ipa-split.c b/gcc/ipa-split.c
index 390adf1..db056c8 100644
--- a/gcc/ipa-split.c
+++ b/gcc/ipa-split.c
@@ -1234,6 +1234,10 @@ split_function (struct split_point *split_point)
 				     !split_part_return_p,
 				     split_point->split_bbs,
 				     split_point->entry_bb, "part");
+
+  /* Let's take a time profile for splitted function.  */
+  node->tp_first_run = cur_node->tp_first_run + 1;
+
   /* For usual cloning it is enough to clear builtin only when signature
      changes.  For partial inlining we however can not expect the part
      of builtin implementation to have same semantic as the whole.  */
diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
index 5b46af9..8119b11 100644
--- a/gcc/lto/lto-partition.c
+++ b/gcc/lto/lto-partition.c
@@ -286,9 +286,11 @@ add_symbol_to_partition (ltrans_partition part, symtab_node *node)
 
      Be lax about comdats; they may or may not be duplicated and we may
      end up in need to duplicate keyed comdat because it has unkeyed alias.  */
+
   gcc_assert (get_symbol_class (node) == SYMBOL_DUPLICATE
 	      || DECL_COMDAT (node->decl)
 	      || !symbol_partitioned_p (node));
+
   add_symbol_to_partition_1 (part, node);
 }
 
@@ -401,6 +403,25 @@ node_cmp (const void *pa, const void *pb)
 {
   const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
   const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  /* Profile reorder flag enables function reordering based on first execution
+     of a function. All functions with profile are placed in ascending
+     order at the beginning.  */
+
+  if (flag_profile_reorder_functions)
+  {
+    /* Functions with time profile are sorted in ascending order.  */
+    if (a->tp_first_run && b->tp_first_run)
+      return a->tp_first_run != b->tp_first_run
+	? a->tp_first_run - b->tp_first_run
+        : a->order - b->order;
+
+    /* Functions with time profile are sorted before the functions
+       that do not have the profile.  */
+    if (a->tp_first_run || b->tp_first_run)
+      return b->tp_first_run - a->tp_first_run;
+  }
+
   return b->order - a->order;
 }
 
@@ -455,7 +476,7 @@ void
 lto_balanced_map (void)
 {
   int n_nodes = 0;
-  int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
+  int n_varpool_nodes = 0, varpool_pos = 0;
   struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid);
   varpool_node **varpool_order = NULL;
   int i;
@@ -487,10 +508,13 @@ lto_balanced_map (void)
      get better about minimizing the function bounday, but until that
      things works smoother if we order in source order.  */
   qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
+
+  if (cgraph_dump_file)
+    for(i = 0; i < n_nodes; i++)
+      fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", order[i]->name (), order[i]->tp_first_run);
+
   if (!flag_toplevel_reorder)
     {
-      qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
-
       FOR_EACH_VARIABLE (vnode)
 	if (get_symbol_class (vnode) == SYMBOL_PARTITION)
 	  n_varpool_nodes++;
@@ -689,7 +713,6 @@ lto_balanced_map (void)
 	  best_i = i;
 	  best_n_nodes = lto_symtab_encoder_size (partition->encoder);
 	  best_total_size = total_size;
-	  best_varpool_pos = varpool_pos;
 	}
       if (cgraph_dump_file)
 	fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
@@ -707,7 +730,6 @@ lto_balanced_map (void)
 		fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
 			 i - best_i, best_i);
 	      undo_partition (partition, best_n_nodes);
-	      varpool_pos = best_varpool_pos;
 	    }
 	  i = best_i;
  	  /* When we are finished, avoid creating empty partition.  */
@@ -855,7 +877,7 @@ may_need_named_section_p (lto_symtab_encoder_t encoder, symtab_node *node)
    of the same name in partition ENCODER (or in whole compilation unit if
    ENCODER is NULL) and if so, mangle the statics.  Always mangle all
    conflicting statics, so we reduce changes of silently miscompiling
-   asm statemnets referring to them by symbol name.  */
+   asm statements referring to them by symbol name.  */
 
 static void
 rename_statics (lto_symtab_encoder_t encoder, symtab_node *node)
diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
index f322a00..8e5eeb3 100644
--- a/gcc/lto/lto.c
+++ b/gcc/lto/lto.c
@@ -2503,9 +2503,12 @@ lto_wpa_write_files (void)
   /* Sort partitions by size so small ones are compiled last.
      FIXME: Even when not reordering we may want to output one list for parallel make
      and other for final link command.  */
-  ltrans_partitions.qsort (flag_toplevel_reorder
+
+  if (!flag_profile_reorder_functions || !flag_profile_use)
+    ltrans_partitions.qsort (flag_toplevel_reorder
 			   ? cmp_partitions_size
 			   : cmp_partitions_order);
+
   for (i = 0; i < n_sets; i++)
     {
       size_t len;
diff --git a/gcc/opts.c b/gcc/opts.c
index 4cb2cdf..5be03fa 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -1710,6 +1710,8 @@ common_handle_option (struct gcc_options *opts,
 	opts->x_flag_vect_cost_model = VECT_COST_MODEL_DYNAMIC;
       if (!opts_set->x_flag_tree_loop_distribute_patterns)
 	opts->x_flag_tree_loop_distribute_patterns = value;
+      if (!opts_set->x_flag_profile_reorder_functions)
+	opts->x_flag_profile_reorder_functions = value;
       /* Indirect call profiling should do all useful transformations
  	 speculative devirtualization does.  */
       if (!opts_set->x_flag_devirtualize_speculatively
diff --git a/gcc/predict.c b/gcc/predict.c
index a5ad34f..1826a06 100644
--- a/gcc/predict.c
+++ b/gcc/predict.c
@@ -2839,12 +2839,24 @@ handle_missing_profiles (void)
     {
       struct cgraph_edge *e;
       gcov_type call_count = 0;
+      gcov_type max_tp_first_run = 0;
       struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
 
       if (node->count)
         continue;
       for (e = node->callers; e; e = e->next_caller)
+      {
         call_count += e->count;
+
+	if (e->caller->tp_first_run > max_tp_first_run)
+	  max_tp_first_run = e->caller->tp_first_run;
+      }
+
+      /* If time profile is missing, let assign the maximum that comes from
+	 caller functions.  */
+      if (!node->tp_first_run && max_tp_first_run)
+	node->tp_first_run = max_tp_first_run + 1;
+
       if (call_count
           && fn && fn->cfg
           && (call_count * unlikely_count_fraction >= profile_info->runs))
diff --git a/gcc/varasm.c b/gcc/varasm.c
index 5c5025a..f34946c 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -552,7 +552,14 @@ default_function_section (tree decl, enum node_frequency freq,
      unlikely executed (this happens especially with function splitting
      where we can split away unnecessary parts of static constructors.  */
   if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
-    return get_named_text_section (decl, ".text.startup", NULL);
+  {
+    /* If we do have a profile or(and) LTO phase is executed, we do not need
+    these ELF section.  */
+    if (!in_lto_p || !flag_profile_values)
+      return get_named_text_section (decl, ".text.startup", NULL);
+    else
+      return NULL;
+  }
 
   /* Similarly for exit.  */
   if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
@@ -564,7 +571,10 @@ default_function_section (tree decl, enum node_frequency freq,
       case NODE_FREQUENCY_UNLIKELY_EXECUTED:
 	return get_named_text_section (decl, ".text.unlikely", NULL);
       case NODE_FREQUENCY_HOT:
-	return get_named_text_section (decl, ".text.hot", NULL);
+        /* If we do have a profile or(and) LTO phase is executed, we do not need
+           these ELF section.  */
+        if (!in_lto_p || !flag_profile_values)
+          return get_named_text_section (decl, ".text.hot", NULL);
       default:
 	return NULL;
     }

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-12-15 23:08                           ` Martin Liška
@ 2013-12-15 23:21                             ` Jan Hubicka
  2013-12-15 23:52                               ` Martin Liška
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Hubicka @ 2013-12-15 23:21 UTC (permalink / raw)
  To: Martin Liška; +Cc: Jan Hubicka, gcc-patches

> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 93e857df..d5a0ac8 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,14 @@
> +2013-12-15  Martin Liska  <marxin.liska@gmail.com>
> +	    Jan Hubicka  <jh@suse.cz>
> +
> +	* cgraphunit.c (node_cmp): New function.
> +	(expand_all_functions): Function ordering added.
> +	* common.opt: New profile based function reordering flag introduced.
> +	* lto-partition.c: Support for time profile added.
> +	* lto.c: Likewise.
> +	* predict.c (handle_missing_profiles): Time profile handled in
> +	  missing profiles.
> +

OK, thanks, with the changes bellow!
(I tought this patch was already in! Also please be careful about
applying the changes - it seems that in the previous commit you
M
omitted some)
> @@ -1842,11 +1859,14 @@ expand_function (struct cgraph_node *node)
>     to use subsections to make the output functions appear in top-down
>     order).  */
>  
> +
Bogus whitespace
>  static void
>  expand_all_functions (void)
>  {
>    struct cgraph_node *node;
>    struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes);
> +
> +  unsigned int expanded_func_count = 0, profiled_func_count = 0;
>    int order_pos, new_order_pos = 0;
>    int i;
>  
> @@ -1859,20 +1879,39 @@ expand_all_functions (void)
>      if (order[i]->process)
>        order[new_order_pos++] = order[i];
>  
> +  if (flag_profile_reorder_functions)
> +    qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp);
> +
>    for (i = new_order_pos - 1; i >= 0; i--)
>      {
>        node = order[i];
> +
>        if (node->process)
>  	{
> +     expanded_func_count++;
> +     if(node->tp_first_run)
> +       profiled_func_count++;
> +
> +    if (cgraph_dump_file)
> +      fprintf (cgraph_dump_file, "Time profile order in expand_all_functions:%s:%d\n", node->asm_name (), node->tp_first_run);
> +
>  	  node->process = 0;
>  	  expand_function (node);
>  	}
>      }
> +
> +    if (in_lto_p && dump_file)
> +      fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
> +               main_input_filename, profiled_func_count, expanded_func_count);
> +
> +  if (cgraph_dump_file && flag_profile_reorder_functions && in_lto_p)
> +    fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n",
> +             profiled_func_count, expanded_func_count);

Make the dumps unconditoinal, I do not see why they should be in_lto_p here.
> @@ -689,7 +713,6 @@ lto_balanced_map (void)
>  	  best_i = i;
>  	  best_n_nodes = lto_symtab_encoder_size (partition->encoder);
>  	  best_total_size = total_size;
> -	  best_varpool_pos = varpool_pos;
>  	}
>        if (cgraph_dump_file)
>  	fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
> @@ -707,7 +730,6 @@ lto_balanced_map (void)
>  		fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
>  			 i - best_i, best_i);
>  	      undo_partition (partition, best_n_nodes);
> -	      varpool_pos = best_varpool_pos;
>  	    }
>  	  i = best_i;
>   	  /* When we are finished, avoid creating empty partition.  */

I already asked you to remove these changes - they revert earlier fix.

> diff --git a/gcc/predict.c b/gcc/predict.c
> index a5ad34f..1826a06 100644
> --- a/gcc/predict.c
> +++ b/gcc/predict.c
> @@ -2839,12 +2839,24 @@ handle_missing_profiles (void)
>      {
>        struct cgraph_edge *e;
>        gcov_type call_count = 0;
> +      gcov_type max_tp_first_run = 0;
>        struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
>  
>        if (node->count)
>          continue;
>        for (e = node->callers; e; e = e->next_caller)
> +      {
>          call_count += e->count;
> +
> +	if (e->caller->tp_first_run > max_tp_first_run)
> +	  max_tp_first_run = e->caller->tp_first_run;
> +      }
> +
> +      /* If time profile is missing, let assign the maximum that comes from
> +	 caller functions.  */
> +      if (!node->tp_first_run && max_tp_first_run)
> +	node->tp_first_run = max_tp_first_run + 1;
> +

I believe you also need minizming node->tp_first_run in ipa_merge_profiles.
>        if (call_count
>            && fn && fn->cfg
>            && (call_count * unlikely_count_fraction >= profile_info->runs))
> diff --git a/gcc/varasm.c b/gcc/varasm.c
> index 5c5025a..f34946c 100644
> --- a/gcc/varasm.c
> +++ b/gcc/varasm.c
> @@ -552,7 +552,14 @@ default_function_section (tree decl, enum node_frequency freq,
>       unlikely executed (this happens especially with function splitting
>       where we can split away unnecessary parts of static constructors.  */
>    if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
> -    return get_named_text_section (decl, ".text.startup", NULL);
> +  {
> +    /* If we do have a profile or(and) LTO phase is executed, we do not need
> +    these ELF section.  */
> +    if (!in_lto_p || !flag_profile_values)
> +      return get_named_text_section (decl, ".text.startup", NULL);
> +    else
> +      return NULL;
> +  }
>  
>    /* Similarly for exit.  */
>    if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
> @@ -564,7 +571,10 @@ default_function_section (tree decl, enum node_frequency freq,
>        case NODE_FREQUENCY_UNLIKELY_EXECUTED:
>  	return get_named_text_section (decl, ".text.unlikely", NULL);
>        case NODE_FREQUENCY_HOT:
> -	return get_named_text_section (decl, ".text.hot", NULL);
> +        /* If we do have a profile or(and) LTO phase is executed, we do not need
> +           these ELF section.  */
> +        if (!in_lto_p || !flag_profile_values)
> +          return get_named_text_section (decl, ".text.hot", NULL);
>        default:
>  	return NULL;
Please duplicate these changes into config/darwin.c that has identical code in it.

OK with those changes.

Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-12-15 23:21                             ` Jan Hubicka
@ 2013-12-15 23:52                               ` Martin Liška
  2013-12-16 10:14                                 ` Jan Hubicka
  0 siblings, 1 reply; 18+ messages in thread
From: Martin Liška @ 2013-12-15 23:52 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 6233 bytes --]

Hello,
   there's updated version of the patch.

Tested on x86_64 with enable bootstrap.

Martin

On 16 December 2013 00:21, Jan Hubicka <hubicka@ucw.cz> wrote:
>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>> index 93e857df..d5a0ac8 100644
>> --- a/gcc/ChangeLog
>> +++ b/gcc/ChangeLog
>> @@ -1,3 +1,14 @@
>> +2013-12-15  Martin Liska  <marxin.liska@gmail.com>
>> +         Jan Hubicka  <jh@suse.cz>
>> +
>> +     * cgraphunit.c (node_cmp): New function.
>> +     (expand_all_functions): Function ordering added.
>> +     * common.opt: New profile based function reordering flag introduced.
>> +     * lto-partition.c: Support for time profile added.
>> +     * lto.c: Likewise.
>> +     * predict.c (handle_missing_profiles): Time profile handled in
>> +       missing profiles.
>> +
>
> OK, thanks, with the changes bellow!
> (I tought this patch was already in! Also please be careful about
> applying the changes - it seems that in the previous commit you
> M
> omitted some)
>> @@ -1842,11 +1859,14 @@ expand_function (struct cgraph_node *node)
>>     to use subsections to make the output functions appear in top-down
>>     order).  */
>>
>> +
> Bogus whitespace
>>  static void
>>  expand_all_functions (void)
>>  {
>>    struct cgraph_node *node;
>>    struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes);
>> +
>> +  unsigned int expanded_func_count = 0, profiled_func_count = 0;
>>    int order_pos, new_order_pos = 0;
>>    int i;
>>
>> @@ -1859,20 +1879,39 @@ expand_all_functions (void)
>>      if (order[i]->process)
>>        order[new_order_pos++] = order[i];
>>
>> +  if (flag_profile_reorder_functions)
>> +    qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp);
>> +
>>    for (i = new_order_pos - 1; i >= 0; i--)
>>      {
>>        node = order[i];
>> +
>>        if (node->process)
>>       {
>> +     expanded_func_count++;
>> +     if(node->tp_first_run)
>> +       profiled_func_count++;
>> +
>> +    if (cgraph_dump_file)
>> +      fprintf (cgraph_dump_file, "Time profile order in expand_all_functions:%s:%d\n", node->asm_name (), node->tp_first_run);
>> +
>>         node->process = 0;
>>         expand_function (node);
>>       }
>>      }
>> +
>> +    if (in_lto_p && dump_file)
>> +      fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
>> +               main_input_filename, profiled_func_count, expanded_func_count);
>> +
>> +  if (cgraph_dump_file && flag_profile_reorder_functions && in_lto_p)
>> +    fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n",
>> +             profiled_func_count, expanded_func_count);
>
> Make the dumps unconditoinal, I do not see why they should be in_lto_p here.
>> @@ -689,7 +713,6 @@ lto_balanced_map (void)
>>         best_i = i;
>>         best_n_nodes = lto_symtab_encoder_size (partition->encoder);
>>         best_total_size = total_size;
>> -       best_varpool_pos = varpool_pos;
>>       }
>>        if (cgraph_dump_file)
>>       fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
>> @@ -707,7 +730,6 @@ lto_balanced_map (void)
>>               fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
>>                        i - best_i, best_i);
>>             undo_partition (partition, best_n_nodes);
>> -           varpool_pos = best_varpool_pos;
>>           }
>>         i = best_i;
>>         /* When we are finished, avoid creating empty partition.  */
>
> I already asked you to remove these changes - they revert earlier fix.
>
>> diff --git a/gcc/predict.c b/gcc/predict.c
>> index a5ad34f..1826a06 100644
>> --- a/gcc/predict.c
>> +++ b/gcc/predict.c
>> @@ -2839,12 +2839,24 @@ handle_missing_profiles (void)
>>      {
>>        struct cgraph_edge *e;
>>        gcov_type call_count = 0;
>> +      gcov_type max_tp_first_run = 0;
>>        struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
>>
>>        if (node->count)
>>          continue;
>>        for (e = node->callers; e; e = e->next_caller)
>> +      {
>>          call_count += e->count;
>> +
>> +     if (e->caller->tp_first_run > max_tp_first_run)
>> +       max_tp_first_run = e->caller->tp_first_run;
>> +      }
>> +
>> +      /* If time profile is missing, let assign the maximum that comes from
>> +      caller functions.  */
>> +      if (!node->tp_first_run && max_tp_first_run)
>> +     node->tp_first_run = max_tp_first_run + 1;
>> +
>
> I believe you also need minizming node->tp_first_run in ipa_merge_profiles.
>>        if (call_count
>>            && fn && fn->cfg
>>            && (call_count * unlikely_count_fraction >= profile_info->runs))
>> diff --git a/gcc/varasm.c b/gcc/varasm.c
>> index 5c5025a..f34946c 100644
>> --- a/gcc/varasm.c
>> +++ b/gcc/varasm.c
>> @@ -552,7 +552,14 @@ default_function_section (tree decl, enum node_frequency freq,
>>       unlikely executed (this happens especially with function splitting
>>       where we can split away unnecessary parts of static constructors.  */
>>    if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
>> -    return get_named_text_section (decl, ".text.startup", NULL);
>> +  {
>> +    /* If we do have a profile or(and) LTO phase is executed, we do not need
>> +    these ELF section.  */
>> +    if (!in_lto_p || !flag_profile_values)
>> +      return get_named_text_section (decl, ".text.startup", NULL);
>> +    else
>> +      return NULL;
>> +  }
>>
>>    /* Similarly for exit.  */
>>    if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
>> @@ -564,7 +571,10 @@ default_function_section (tree decl, enum node_frequency freq,
>>        case NODE_FREQUENCY_UNLIKELY_EXECUTED:
>>       return get_named_text_section (decl, ".text.unlikely", NULL);
>>        case NODE_FREQUENCY_HOT:
>> -     return get_named_text_section (decl, ".text.hot", NULL);
>> +        /* If we do have a profile or(and) LTO phase is executed, we do not need
>> +           these ELF section.  */
>> +        if (!in_lto_p || !flag_profile_values)
>> +          return get_named_text_section (decl, ".text.hot", NULL);
>>        default:
>>       return NULL;
> Please duplicate these changes into config/darwin.c that has identical code in it.
>
> OK with those changes.
>
> Honza

[-- Attachment #2: time-profiler-2.5.patch --]
[-- Type: text/x-patch, Size: 13808 bytes --]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 93e857df..d5a0ac8 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,14 @@
+2013-12-15  Martin Liska  <marxin.liska@gmail.com>
+	    Jan Hubicka  <jh@suse.cz>
+
+	* cgraphunit.c (node_cmp): New function.
+	(expand_all_functions): Function ordering added.
+	* common.opt: New profile based function reordering flag introduced.
+	* lto-partition.c: Support for time profile added.
+	* lto.c: Likewise.
+	* predict.c (handle_missing_profiles): Time profile handled in
+	  missing profiles.
+
 2013-12-14   Jan Hubicka  <jh@suse.cz>
 
 	PR middle-end/58477
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index 44f3afd..28f5116 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -1831,6 +1831,23 @@ expand_function (struct cgraph_node *node)
   ipa_remove_all_references (&node->ref_list);
 }
 
+/* Node comparer that is responsible for the order that corresponds
+   to time when a function was launched for the first time.  */
+
+static int
+node_cmp (const void *pa, const void *pb)
+{
+  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
+  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  /* Functions with time profile must be before these without profile.  */
+  if (!a->tp_first_run || !b->tp_first_run)
+    return a->tp_first_run - b->tp_first_run;
+
+  return a->tp_first_run != b->tp_first_run
+	 ? b->tp_first_run - a->tp_first_run
+	 : b->order - a->order;
+}
 
 /* Expand all functions that must be output.
 
@@ -1847,6 +1864,7 @@ expand_all_functions (void)
 {
   struct cgraph_node *node;
   struct cgraph_node **order = XCNEWVEC (struct cgraph_node *, cgraph_n_nodes);
+  unsigned int expanded_func_count = 0, profiled_func_count = 0;
   int order_pos, new_order_pos = 0;
   int i;
 
@@ -1859,20 +1877,39 @@ expand_all_functions (void)
     if (order[i]->process)
       order[new_order_pos++] = order[i];
 
+  if (flag_profile_reorder_functions)
+    qsort (order, new_order_pos, sizeof (struct cgraph_node *), node_cmp);
+
   for (i = new_order_pos - 1; i >= 0; i--)
     {
       node = order[i];
+
       if (node->process)
 	{
+     expanded_func_count++;
+     if(node->tp_first_run)
+       profiled_func_count++;
+
+    if (cgraph_dump_file)
+      fprintf (cgraph_dump_file, "Time profile order in expand_all_functions:%s:%d\n", node->asm_name (), node->tp_first_run);
+
 	  node->process = 0;
 	  expand_function (node);
 	}
     }
+
+    if (dump_file)
+      fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
+               main_input_filename, profiled_func_count, expanded_func_count);
+
+  if (cgraph_dump_file && flag_profile_reorder_functions)
+    fprintf (cgraph_dump_file, "Expanded functions with time profile:%u/%u\n",
+             profiled_func_count, expanded_func_count);
+
   cgraph_process_new_functions ();
   free_gimplify_stack ();
 
   free (order);
-
 }
 
 /* This is used to sort the node types by the cgraph order number.  */
@@ -2194,6 +2231,7 @@ compile (void)
 #endif
 
   cgraph_state = CGRAPH_STATE_EXPANSION;
+
   if (!flag_toplevel_reorder)
     output_in_order ();
   else
diff --git a/gcc/common.opt b/gcc/common.opt
index 0cd1fdd..ea323fd 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1712,6 +1712,10 @@ fprofile-report
 Common Report Var(profile_report)
 Report on consistency of profile
 
+fprofile-reorder-functions
+Common Report Var(flag_profile_reorder_functions)
+Enable function reordering that improves code placement
+
 frandom-seed
 Common Var(common_deferred_options) Defer
 
diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c
index ea056a9..4267c89 100644
--- a/gcc/config/darwin.c
+++ b/gcc/config/darwin.c
@@ -3621,9 +3621,16 @@ darwin_function_section (tree decl, enum node_frequency freq,
      unlikely executed (this happens especially with function splitting
      where we can split away unnecessary parts of static constructors).  */
   if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
-    return (weak)
-	    ? darwin_sections[text_startup_coal_section]
-	    : darwin_sections[text_startup_section];
+  {
+    /* If we do have a profile or(and) LTO phase is executed, we do not need
+       these ELF section.  */
+    if (!in_lto_p || !flag_profile_values)
+      return (weak)
+	      ? darwin_sections[text_startup_coal_section]
+	      : darwin_sections[text_startup_section];
+    else
+      return text_section;
+  }
 
   /* Similarly for exit.  */
   if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
@@ -3640,10 +3647,15 @@ darwin_function_section (tree decl, enum node_frequency freq,
 		: darwin_sections[text_cold_section];
 	break;
       case NODE_FREQUENCY_HOT:
-	return (weak)
-		? darwin_sections[text_hot_coal_section]
-		: darwin_sections[text_hot_section];
-	break;
+      {
+        /* If we do have a profile or(and) LTO phase is executed, we do not need
+           these ELF section.  */
+        if (!in_lto_p || !flag_profile_values)
+          return (weak)
+                  ? darwin_sections[text_hot_coal_section]
+                  : darwin_sections[text_hot_section];
+        break;
+      }
       default:
 	return (weak)
 		? darwin_sections[text_coal_section]
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b655a64..b30a368 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -394,7 +394,7 @@ Objective-C and Objective-C++ Dialects}.
 -fprefetch-loop-arrays -fprofile-report @gol
 -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
 -fprofile-generate=@var{path} @gol
--fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
+-fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol
 -freciprocal-math -free -frename-registers -freorder-blocks @gol
 -freorder-blocks-and-partition -freorder-functions @gol
 -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
@@ -9071,6 +9071,14 @@ from profiling values of expressions for usage in optimizations.
 
 Enabled with @option{-fprofile-generate} and @option{-fprofile-use}.
 
+@item -fprofile-reoder-functions
+@opindex fprofile-reorder-functions
+Function reordering based on profile instrumentation collects
+first time of execution of a function and orders these functions
+in ascending order.
+
+Enabled with @option{-fprofile-use}.
+
 @item -fvpt
 @opindex fvpt
 If combined with @option{-fprofile-arcs}, this option instructs the compiler
diff --git a/gcc/ipa-split.c b/gcc/ipa-split.c
index 390adf1..db056c8 100644
--- a/gcc/ipa-split.c
+++ b/gcc/ipa-split.c
@@ -1234,6 +1234,10 @@ split_function (struct split_point *split_point)
 				     !split_part_return_p,
 				     split_point->split_bbs,
 				     split_point->entry_bb, "part");
+
+  /* Let's take a time profile for splitted function.  */
+  node->tp_first_run = cur_node->tp_first_run + 1;
+
   /* For usual cloning it is enough to clear builtin only when signature
      changes.  For partial inlining we however can not expect the part
      of builtin implementation to have same semantic as the whole.  */
diff --git a/gcc/ipa-utils.c b/gcc/ipa-utils.c
index 9297280..6641626 100644
--- a/gcc/ipa-utils.c
+++ b/gcc/ipa-utils.c
@@ -655,6 +655,11 @@ ipa_merge_profiles (struct cgraph_node *dst,
     return;
   if (src->frequency < dst->frequency)
     src->frequency = dst->frequency;
+
+  /* Time profiles are merged.  */
+  if (dst->tp_first_run > src->tp_first_run && src->tp_first_run)
+    dst->tp_first_run = src->tp_first_run;
+
   if (!dst->count)
     return;
   if (cgraph_dump_file)
diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
index 5b46af9..5e0335e 100644
--- a/gcc/lto/lto-partition.c
+++ b/gcc/lto/lto-partition.c
@@ -286,9 +286,11 @@ add_symbol_to_partition (ltrans_partition part, symtab_node *node)
 
      Be lax about comdats; they may or may not be duplicated and we may
      end up in need to duplicate keyed comdat because it has unkeyed alias.  */
+
   gcc_assert (get_symbol_class (node) == SYMBOL_DUPLICATE
 	      || DECL_COMDAT (node->decl)
 	      || !symbol_partitioned_p (node));
+
   add_symbol_to_partition_1 (part, node);
 }
 
@@ -401,6 +403,25 @@ node_cmp (const void *pa, const void *pb)
 {
   const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
   const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
+
+  /* Profile reorder flag enables function reordering based on first execution
+     of a function. All functions with profile are placed in ascending
+     order at the beginning.  */
+
+  if (flag_profile_reorder_functions)
+  {
+    /* Functions with time profile are sorted in ascending order.  */
+    if (a->tp_first_run && b->tp_first_run)
+      return a->tp_first_run != b->tp_first_run
+	? a->tp_first_run - b->tp_first_run
+        : a->order - b->order;
+
+    /* Functions with time profile are sorted before the functions
+       that do not have the profile.  */
+    if (a->tp_first_run || b->tp_first_run)
+      return b->tp_first_run - a->tp_first_run;
+  }
+
   return b->order - a->order;
 }
 
@@ -487,10 +508,13 @@ lto_balanced_map (void)
      get better about minimizing the function bounday, but until that
      things works smoother if we order in source order.  */
   qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
+
+  if (cgraph_dump_file)
+    for(i = 0; i < n_nodes; i++)
+      fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", order[i]->name (), order[i]->tp_first_run);
+
   if (!flag_toplevel_reorder)
     {
-      qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
-
       FOR_EACH_VARIABLE (vnode)
 	if (get_symbol_class (vnode) == SYMBOL_PARTITION)
 	  n_varpool_nodes++;
@@ -855,7 +879,7 @@ may_need_named_section_p (lto_symtab_encoder_t encoder, symtab_node *node)
    of the same name in partition ENCODER (or in whole compilation unit if
    ENCODER is NULL) and if so, mangle the statics.  Always mangle all
    conflicting statics, so we reduce changes of silently miscompiling
-   asm statemnets referring to them by symbol name.  */
+   asm statements referring to them by symbol name.  */
 
 static void
 rename_statics (lto_symtab_encoder_t encoder, symtab_node *node)
diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
index f322a00..8e5eeb3 100644
--- a/gcc/lto/lto.c
+++ b/gcc/lto/lto.c
@@ -2503,9 +2503,12 @@ lto_wpa_write_files (void)
   /* Sort partitions by size so small ones are compiled last.
      FIXME: Even when not reordering we may want to output one list for parallel make
      and other for final link command.  */
-  ltrans_partitions.qsort (flag_toplevel_reorder
+
+  if (!flag_profile_reorder_functions || !flag_profile_use)
+    ltrans_partitions.qsort (flag_toplevel_reorder
 			   ? cmp_partitions_size
 			   : cmp_partitions_order);
+
   for (i = 0; i < n_sets; i++)
     {
       size_t len;
diff --git a/gcc/opts.c b/gcc/opts.c
index 4cb2cdf..5be03fa 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -1710,6 +1710,8 @@ common_handle_option (struct gcc_options *opts,
 	opts->x_flag_vect_cost_model = VECT_COST_MODEL_DYNAMIC;
       if (!opts_set->x_flag_tree_loop_distribute_patterns)
 	opts->x_flag_tree_loop_distribute_patterns = value;
+      if (!opts_set->x_flag_profile_reorder_functions)
+	opts->x_flag_profile_reorder_functions = value;
       /* Indirect call profiling should do all useful transformations
  	 speculative devirtualization does.  */
       if (!opts_set->x_flag_devirtualize_speculatively
diff --git a/gcc/predict.c b/gcc/predict.c
index a5ad34f..1826a06 100644
--- a/gcc/predict.c
+++ b/gcc/predict.c
@@ -2839,12 +2839,24 @@ handle_missing_profiles (void)
     {
       struct cgraph_edge *e;
       gcov_type call_count = 0;
+      gcov_type max_tp_first_run = 0;
       struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
 
       if (node->count)
         continue;
       for (e = node->callers; e; e = e->next_caller)
+      {
         call_count += e->count;
+
+	if (e->caller->tp_first_run > max_tp_first_run)
+	  max_tp_first_run = e->caller->tp_first_run;
+      }
+
+      /* If time profile is missing, let assign the maximum that comes from
+	 caller functions.  */
+      if (!node->tp_first_run && max_tp_first_run)
+	node->tp_first_run = max_tp_first_run + 1;
+
       if (call_count
           && fn && fn->cfg
           && (call_count * unlikely_count_fraction >= profile_info->runs))
diff --git a/gcc/varasm.c b/gcc/varasm.c
index 5c5025a..f34946c 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -552,7 +552,14 @@ default_function_section (tree decl, enum node_frequency freq,
      unlikely executed (this happens especially with function splitting
      where we can split away unnecessary parts of static constructors.  */
   if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
-    return get_named_text_section (decl, ".text.startup", NULL);
+  {
+    /* If we do have a profile or(and) LTO phase is executed, we do not need
+    these ELF section.  */
+    if (!in_lto_p || !flag_profile_values)
+      return get_named_text_section (decl, ".text.startup", NULL);
+    else
+      return NULL;
+  }
 
   /* Similarly for exit.  */
   if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
@@ -564,7 +571,10 @@ default_function_section (tree decl, enum node_frequency freq,
       case NODE_FREQUENCY_UNLIKELY_EXECUTED:
 	return get_named_text_section (decl, ".text.unlikely", NULL);
       case NODE_FREQUENCY_HOT:
-	return get_named_text_section (decl, ".text.hot", NULL);
+        /* If we do have a profile or(and) LTO phase is executed, we do not need
+           these ELF section.  */
+        if (!in_lto_p || !flag_profile_values)
+          return get_named_text_section (decl, ".text.hot", NULL);
       default:
 	return NULL;
     }

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-12-15 23:52                               ` Martin Liška
@ 2013-12-16 10:14                                 ` Jan Hubicka
  2013-12-20 12:17                                   ` Iain Sandoe
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Hubicka @ 2013-12-16 10:14 UTC (permalink / raw)
  To: Martin Liška; +Cc: Jan Hubicka, gcc-patches

> Hello,
>    there's updated version of the patch.
> 
> Tested on x86_64 with enable bootstrap.
> 
> Martin
> 
> On 16 December 2013 00:21, Jan Hubicka <hubicka@ucw.cz> wrote:
> >> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> >> index 93e857df..d5a0ac8 100644
> >> --- a/gcc/ChangeLog
> >> +++ b/gcc/ChangeLog
> >> @@ -1,3 +1,14 @@
> >> +2013-12-15  Martin Liska  <marxin.liska@gmail.com>
> >> +         Jan Hubicka  <jh@suse.cz>
> >> +
> >> +     * cgraphunit.c (node_cmp): New function.
> >> +     (expand_all_functions): Function ordering added.
> >> +     * common.opt: New profile based function reordering flag introduced.
> >> +     * lto-partition.c: Support for time profile added.
> >> +     * lto.c: Likewise.
> >> +     * predict.c (handle_missing_profiles): Time profile handled in
> >> +       missing profiles.
> >> +

OK,
thanks!
Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
  2013-12-16 10:14                                 ` Jan Hubicka
@ 2013-12-20 12:17                                   ` Iain Sandoe
  0 siblings, 0 replies; 18+ messages in thread
From: Iain Sandoe @ 2013-12-20 12:17 UTC (permalink / raw)
  To: Jan Hubicka, Martin Liška
  Cc: gcc-patches, Dominique Dhumieres, Mike Stump

Hi Martin,

Thanks for working on this!
 --- However you have introduced some problems including a bootstrap fail on darwin.

On 16 Dec 2013, at 10:13, Jan Hubicka wrote:

>> Hello,
>>   there's updated version of the patch.
>> 
>> Tested on x86_64 with enable bootstrap.
>> 
>> Martin
>> 
>> On 16 December 2013 00:21, Jan Hubicka <hubicka@ucw.cz> wrote:
>>>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>>>> index 93e857df..d5a0ac8 100644
>>>> --- a/gcc/ChangeLog
>>>> +++ b/gcc/ChangeLog
>>>> @@ -1,3 +1,14 @@
>>>> +2013-12-15  Martin Liska  <marxin.liska@gmail.com>
>>>> +         Jan Hubicka  <jh@suse.cz>
>>>> +
>>>> +     * cgraphunit.c (node_cmp): New function.
>>>> +     (expand_all_functions): Function ordering added.
>>>> +     * common.opt: New profile based function reordering flag introduced.
>>>> +     * lto-partition.c: Support for time profile added.
>>>> +     * lto.c: Likewise.
>>>> +     * predict.c (handle_missing_profiles): Time profile handled in
>>>> +       missing profiles.
>>>> +

There is no mention of config/darwin.c here ^

----
diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c
index ea056a9..4267c89 100644
--- a/gcc/config/darwin.c
+++ b/gcc/config/darwin.c
@@ -3621,9 +3621,16 @@ darwin_function_section (tree decl, enum node_frequency freq,
      unlikely executed (this happens especially with function splitting
      where we can split away unnecessary parts of static constructors).  */
   if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
-    return (weak)
-	    ? darwin_sections[text_startup_coal_section]
-	    : darwin_sections[text_startup_section];
+  {
+    /* If we do have a profile or(and) LTO phase is executed, we do not need
+       these ELF section.  */
+    if (!in_lto_p || !flag_profile_values)
+      return (weak)
+	      ? darwin_sections[text_startup_coal_section]
+	      : darwin_sections[text_startup_section];
+    else
+      return text_section;
+  }
 
   /* Similarly for exit.  */
   if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
@@ -3640,10 +3647,15 @@ darwin_function_section (tree decl, enum node_frequency freq,
 		: darwin_sections[text_cold_section];
 	break;
       case NODE_FREQUENCY_HOT:
-	return (weak)
-		? darwin_sections[text_hot_coal_section]
-		: darwin_sections[text_hot_section];
-	break;
+      {
+        /* If we do have a profile or(and) LTO phase is executed, we do not need
+           these ELF section.  */
+        if (!in_lto_p || !flag_profile_values)
+          return (weak)
+                  ? darwin_sections[text_hot_coal_section]
+                  : darwin_sections[text_hot_section];
+        break;
+      }
       default:
 	return (weak)
 		? darwin_sections[text_coal_section]

-=====

This is NOT OK for darwin, it breaks bootstrap with  pr59541.

If one fixes that trivial issue - then there is a lot of test-suite fallout of profiled code.

Please explain what the logic is intended to implement - and ensure that all the code-paths have equivalent treatment - it all looks inconsitent to me at present.

I am sure that one of the darwin folks will be happy to review (or at least test) changes.

cheers
Iain

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Time profiler - phase 2
@ 2013-12-20 11:36 Dominique Dhumieres
  0 siblings, 0 replies; 18+ messages in thread
From: Dominique Dhumieres @ 2013-12-20 11:36 UTC (permalink / raw)
  To: gcc-patches; +Cc: hubicka, marxin.liska

> Hello,
>    there's updated version of the patch.
> 
> Tested on x86_64 with enable bootstrap.
> 
> Martin

This caused pr59541.

TIA

Dominique

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2013-12-20 12:17 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-11-14  1:12 [PATCH] Time profiler - phase 2 Martin Liška
2013-11-16 12:59 ` Jan Hubicka
2013-11-17 23:38   ` Martin Liška
2013-11-18  4:37     ` Jan Hubicka
     [not found]       ` <CAObPJ3MqUhdybQ+Ua08ab680aoqXSAoq+tAfXi_xxkyYXzM2tg@mail.gmail.com>
2013-11-18  7:43         ` Jan Hubicka
2013-11-18  9:21           ` Martin Liška
2013-11-18 10:56             ` Jan Hubicka
     [not found]               ` <CAObPJ3M7maO8FmJHMpKFv-sqr=ch=gUWRDKdBzyunmAYDfzpHw@mail.gmail.com>
2013-12-02 23:37                 ` Martin Liška
     [not found]                 ` <CAObPJ3OKRZhb8UYEtvq3RPt3Xw_QzGEpx_vFjFGibtiLXZWSVQ@mail.gmail.com>
2013-12-05 13:35                   ` Jan Hubicka
2013-12-05 13:38                     ` Jan Hubicka
2013-12-05 19:38                       ` Martin Liška
2013-12-05 21:32                         ` Jan Hubicka
2013-12-15 23:08                           ` Martin Liška
2013-12-15 23:21                             ` Jan Hubicka
2013-12-15 23:52                               ` Martin Liška
2013-12-16 10:14                                 ` Jan Hubicka
2013-12-20 12:17                                   ` Iain Sandoe
2013-12-20 11:36 Dominique Dhumieres

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).