public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* fix for PR49888 var-tracking compile-time regression
@ 2013-01-16 10:29 Alexandre Oliva
  2013-01-16 10:58 ` Jakub Jelinek
  0 siblings, 1 reply; 7+ messages in thread
From: Alexandre Oliva @ 2013-01-16 10:29 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 702 bytes --]

PR49888 introduced clobber_overlapping_mems to detach VALUEs (and
variables bound to them) from MEMs as the MEMs are modified.  This
turned out to be quite expensive, particularly the computation of
canonical addresses passed to alias dependency.

This patch introduces caching of the canonical addresses to alleviate
this expensive operation.  We cache mappings that apply to the entire
function, from equivalences recorded in the global cselib table, and
mappings that apply only to the local basic block.  This cut down 2% of
a full regstrap cycle, and about 1% of the time it took to build target
libraries for C, C++ and Java.

Regstrapped on x86_64-linux-gnu and i686-linux-gnu.  Ok to install?


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: vta-get-addr-caching-pr54402-pr54114.patch --]
[-- Type: text/x-diff, Size: 10610 bytes --]

Cache canonical addresses within VTA

From: Alexandre Oliva <aoliva@redhat.com>

for  gcc/ChangeLog

	PR debug/54114
	PR debug/54402
	PR debug/49888
	* var-tracking.c (negative_power_of_two_p): New.
	(global_get_addr_cache, local_get_addr_cache): New.
	(get_addr_from_global_cache, get_addr_from_local_cache): New.
	(vt_canonicalize_addr): Rewrite using the above.
	(local_get_addr_clear_given_value): New.
	(val_reset): Clear local cached entries.
	(compute_bb_dataflow): Create and release the local cache.
	(vt_emit_notes): Likewise.
	(vt_initialize, vt_finalize): Create and release the global
	cache, respectively.
---

 gcc/var-tracking.c |  243 +++++++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 200 insertions(+), 43 deletions(-)


diff --git a/gcc/var-tracking.c b/gcc/var-tracking.c
index 8c76fe0..befde0a 100644
--- a/gcc/var-tracking.c
+++ b/gcc/var-tracking.c
@@ -1948,6 +1948,14 @@ var_regno_delete (dataflow_set *set, int regno)
   *reg = NULL;
 }
 
+/* Return true if I is the negated value of a power of two.  */
+static bool
+negative_power_of_two_p (HOST_WIDE_INT i)
+{
+  unsigned HOST_WIDE_INT x = -(unsigned HOST_WIDE_INT)i;
+  return x == (x & -x);
+}
+
 /* Strip constant offsets and alignments off of LOC.  Return the base
    expression.  */
 
@@ -1958,12 +1966,123 @@ vt_get_canonicalize_base (rtx loc)
 	  || GET_CODE (loc) == AND)
 	 && GET_CODE (XEXP (loc, 1)) == CONST_INT
 	 && (GET_CODE (loc) != AND
-	     || INTVAL (XEXP (loc, 1)) < 0))
+	     || negative_power_of_two_p (INTVAL (XEXP (loc, 1)))))
     loc = XEXP (loc, 0);
 
   return loc;
 }
 
+/* This caches canonicalized addresses for VALUEs, computed using
+   information in the global cselib table.  */
+static struct pointer_map_t *global_get_addr_cache;
+
+/* This caches canonicalized addresses for VALUEs, computed using
+   information from the global cache and information pertaining to a
+   basic block being analyzed.  */
+static struct pointer_map_t *local_get_addr_cache;
+
+static rtx vt_canonicalize_addr (dataflow_set *, rtx);
+
+/* Return the canonical address for LOC, that must be a VALUE, using a
+   cached global equivalence or computing it and storing it in the
+   global cache.  */
+
+static rtx
+get_addr_from_global_cache (rtx const loc)
+{
+  rtx x;
+  void **slot;
+
+  gcc_checking_assert (GET_CODE (loc) == VALUE);
+  
+  slot = pointer_map_insert (global_get_addr_cache, loc);
+  if (*slot)
+    return (rtx)*slot;
+
+  x = canon_rtx (get_addr (loc));
+
+  /* Tentative, avoiding infinite recursion.  */
+  *slot = x;
+
+  if (x != loc)
+    {
+      rtx nx = vt_canonicalize_addr (NULL, x);
+      if (nx != x)
+	{
+	  /* The table may have moved during recursion, recompute
+	     SLOT.  */
+	  slot = pointer_map_contains (global_get_addr_cache, loc);
+	  *slot = x = nx;
+	}
+    }
+
+  return x;
+}
+
+/* Return the canonical address for LOC, that must be a VALUE, using a
+   cached local equivalence or computing it and storing it in the
+   local cache.  */
+
+static rtx
+get_addr_from_local_cache (dataflow_set *set, rtx const loc)
+{
+  rtx x;
+  void **slot;
+  decl_or_value dv;
+  variable var;
+  location_chain l;
+
+  gcc_checking_assert (GET_CODE (loc) == VALUE);
+  
+  slot = pointer_map_insert (local_get_addr_cache, loc);
+  if (*slot)
+    return (rtx)*slot;
+
+  x = get_addr_from_global_cache (loc);
+  
+  /* Tentative, avoiding infinite recursion.  */
+  *slot = x;
+
+  /* Recurse to cache local expansion of X, or if we need to search
+     for a VALUE in the expansion.  */
+  if (x != loc)
+    {
+      rtx nx = vt_canonicalize_addr (set, x);
+      if (nx != x)
+	{
+	  slot = pointer_map_contains (local_get_addr_cache, loc);
+	  *slot = x = nx;
+	}
+      return x;
+    }
+
+  dv = dv_from_rtx (x);
+  var = (variable) htab_find_with_hash (shared_hash_htab (set->vars),
+					dv, dv_htab_hash (dv));
+  if (!var)
+    return x;
+
+  /* Look for an improved equivalent expression.  */
+  for (l = var->var_part[0].loc_chain; l; l = l->next)
+    {
+      rtx base = vt_get_canonicalize_base (l->loc);
+      if (GET_CODE (base) == REG
+	  || (GET_CODE (base) == VALUE
+	      && canon_value_cmp (base, loc)))
+	{
+	  rtx nx = vt_canonicalize_addr (set, l->loc);
+	  if (x != nx)
+	    {
+	      slot = pointer_map_contains (local_get_addr_cache, loc);
+	      *slot = x = nx;
+	    }
+	  break;
+	}
+    }
+
+  return x;
+}
+
 /* Canonicalize LOC using equivalences from SET in addition to those
    in the cselib static table.  */
 
@@ -1972,69 +2091,56 @@ vt_canonicalize_addr (dataflow_set *set, rtx oloc)
 {
   HOST_WIDE_INT ofst = 0;
   enum machine_mode mode = GET_MODE (oloc);
-  rtx loc = canon_rtx (get_addr (oloc));
+  rtx loc = oloc;
+  rtx x;
+  bool retry = true;
 
-  /* Try to substitute a base VALUE for equivalent expressions as much
-     as possible.  The goal here is to expand stack-related addresses
-     to one of the stack base registers, so that we can compare
-     addresses for overlaps.  */
-  while (GET_CODE (vt_get_canonicalize_base (loc)) == VALUE)
+  while (retry)
     {
-      rtx x;
-      decl_or_value dv;
-      variable var;
-      location_chain l;
-
-      while (GET_CODE (loc) == PLUS)
+      while (GET_CODE (loc) == PLUS
+	     && GET_CODE (XEXP (loc, 1)) == CONST_INT)
 	{
 	  ofst += INTVAL (XEXP (loc, 1));
 	  loc = XEXP (loc, 0);
-	  continue;
 	}
 
       /* Alignment operations can't normally be combined, so just
 	 canonicalize the base and we're done.  We'll normally have
 	 only one stack alignment anyway.  */
-      if (GET_CODE (loc) == AND)
+      if (GET_CODE (loc) == AND
+	  && GET_CODE (XEXP (loc, 1)) == CONST_INT
+	  && negative_power_of_two_p (INTVAL (XEXP (loc, 1))))
 	{
 	  x = vt_canonicalize_addr (set, XEXP (loc, 0));
 	  if (x != XEXP (loc, 0))
 	    loc = gen_rtx_AND (mode, x, XEXP (loc, 1));
-	  loc = canon_rtx (get_addr (loc));
-	  break;
+	  retry = false;
 	}
 
-      x = canon_rtx (get_addr (loc));
-
-      /* We've made progress!  Start over.  */
-      if (x != loc || GET_CODE (x) != VALUE)
+      if (GET_CODE (loc) == VALUE)
 	{
-	  loc = x;
-	  continue;
-	}
-
-      dv = dv_from_rtx (x);
-      var = (variable) htab_find_with_hash (shared_hash_htab (set->vars),
-					    dv, dv_htab_hash (dv));
-      if (!var)
-	break;
+	  if (set)
+	    loc = get_addr_from_local_cache (set, loc);
+	  else
+	    loc = get_addr_from_global_cache (loc);
 
-      /* Look for an improved equivalent expression.  */
-      for (l = var->var_part[0].loc_chain; l; l = l->next)
-	{
-	  rtx base = vt_get_canonicalize_base (l->loc);
-	  if (GET_CODE (base) == REG
-	      || (GET_CODE (base) == VALUE
-		  && canon_value_cmp (base, loc)))
+	  /* Consolidate plus_constants.  */
+	  while (ofst && GET_CODE (loc) == PLUS
+		 && GET_CODE (XEXP (loc, 1)) == CONST_INT)
 	    {
-	      loc = l->loc;
-	      break;
+	      ofst += INTVAL (XEXP (loc, 1));
+	      loc = XEXP (loc, 0);
 	    }
-	}
 
-      /* No luck with the dataflow set, so we're done.  */
-      if (!l)
-	break;
+	  retry = false;
+	}
+      else
+	{
+	  x = canon_rtx (loc);
+	  if (retry)
+	    retry = (x != loc);
+	  loc = x;
+	}
     }
 
   /* Add OFST back in.  */
@@ -2358,6 +2464,17 @@ val_store (dataflow_set *set, rtx val, rtx loc, rtx insn, bool modified)
   val_bind (set, val, loc, modified);
 }
 
+/* Clear (canonical address) slots that reference X.  */
+
+static bool
+local_get_addr_clear_given_value (const void *v ATTRIBUTE_UNUSED,
+				  void **slot, void *x)
+{
+  if (vt_get_canonicalize_base ((rtx)*slot) == x)
+    *slot = NULL;
+  return true;
+}
+
 /* Reset this node, detaching all its equivalences.  Return the slot
    in the variable hash table that holds dv, if there is one.  */
 
@@ -2373,6 +2490,28 @@ val_reset (dataflow_set *set, decl_or_value dv)
 
   gcc_assert (var->n_var_parts == 1);
 
+  if (var->onepart == ONEPART_VALUE)
+    {
+      rtx x = dv_as_value (dv);
+      void **slot;
+      
+      /* Relationships in the global cache don't change, so reset the
+	 local cache entry only.  */
+      slot = pointer_map_contains (local_get_addr_cache, x);
+      if (slot)
+	{
+	  /* If the value resolved back to itself, odds are that other
+	     values may have cached it too.  These entries now refer
+	     to the old X, so detach them too.  Entries that used the
+	     old X but resolved to something else remain ok as long as
+	     that something else isn't also reset.  */
+	  if (*slot == x)
+	    pointer_map_traverse (local_get_addr_cache,
+				  local_get_addr_clear_given_value, x);
+	  *slot = NULL;
+	}
+    }
+
   cval = NULL;
   for (node = var->var_part[0].loc_chain; node; node = node->next)
     if (GET_CODE (node->loc) == VALUE
@@ -6420,6 +6559,9 @@ compute_bb_dataflow (basic_block bb)
   dataflow_set_copy (&old_out, out);
   dataflow_set_copy (out, in);
 
+  if (MAY_HAVE_DEBUG_INSNS)
+    local_get_addr_cache = pointer_map_create ();
+
   FOR_EACH_VEC_ELT (VTI (bb)->mos, i, mo)
     {
       rtx insn = mo->insn;
@@ -6696,6 +6838,9 @@ compute_bb_dataflow (basic_block bb)
 
   if (MAY_HAVE_DEBUG_INSNS)
     {
+      pointer_map_destroy (local_get_addr_cache);
+      local_get_addr_cache = NULL;
+
       dataflow_set_equiv_regs (out);
       htab_traverse (shared_hash_htab (out->vars), canonicalize_values_mark,
 		     out);
@@ -9236,9 +9381,16 @@ vt_emit_notes (void)
 	 subsequent basic blocks.  */
       emit_notes_for_differences (BB_HEAD (bb), &cur, &VTI (bb)->in);
 
+      if (MAY_HAVE_DEBUG_INSNS)
+	local_get_addr_cache = pointer_map_create ();
+
       /* Emit the notes for the changes in the basic block itself.  */
       emit_notes_in_bb (bb, &cur);
 
+      if (MAY_HAVE_DEBUG_INSNS)
+	pointer_map_destroy (local_get_addr_cache);
+      local_get_addr_cache = NULL;
+
       /* Free memory occupied by the in hash table, we won't need it
 	 again.  */
       dataflow_set_clear (&VTI (bb)->in);
@@ -9607,11 +9759,13 @@ vt_initialize (void)
       valvar_pool = create_alloc_pool ("small variable_def pool",
 				       sizeof (struct variable_def), 256);
       preserved_values.create (256);
+      global_get_addr_cache = pointer_map_create ();
     }
   else
     {
       scratch_regs = NULL;
       valvar_pool = NULL;
+      global_get_addr_cache = NULL;
     }
 
   if (MAY_HAVE_DEBUG_INSNS)
@@ -9948,6 +10102,9 @@ vt_finalize (void)
 
   if (MAY_HAVE_DEBUG_INSNS)
     {
+      if (global_get_addr_cache)
+	pointer_map_destroy (global_get_addr_cache);
+      global_get_addr_cache = NULL;
       if (loc_exp_dep_pool)
 	free_alloc_pool (loc_exp_dep_pool);
       loc_exp_dep_pool = NULL;

[-- Attachment #3: Type: text/plain, Size: 258 bytes --]



-- 
Alexandre Oliva, freedom fighter    http://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/   FSF Latin America board member
Free Software Evangelist      Red Hat Brazil Compiler Engineer

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fix for PR49888 var-tracking compile-time regression
  2013-01-16 10:29 fix for PR49888 var-tracking compile-time regression Alexandre Oliva
@ 2013-01-16 10:58 ` Jakub Jelinek
  2013-01-16 13:25   ` Alexandre Oliva
  0 siblings, 1 reply; 7+ messages in thread
From: Jakub Jelinek @ 2013-01-16 10:58 UTC (permalink / raw)
  To: Alexandre Oliva; +Cc: gcc-patches

On Wed, Jan 16, 2013 at 08:28:59AM -0200, Alexandre Oliva wrote:
> PR49888 introduced clobber_overlapping_mems to detach VALUEs (and
> variables bound to them) from MEMs as the MEMs are modified.  This
> turned out to be quite expensive, particularly the computation of
> canonical addresses passed to alias dependency.
> 
> This patch introduces caching of the canonical addresses to alleviate
> this expensive operation.  We cache mappings that apply to the entire
> function, from equivalences recorded in the global cselib table, and
> mappings that apply only to the local basic block.  This cut down 2% of
> a full regstrap cycle, and about 1% of the time it took to build target
> libraries for C, C++ and Java.

Can you safely cache the canon addresses already during vt_initialize
(when cselib_* is still processing new insns, cselib VALUEs contain
REGs and MEMs that are flushed at the end of processing the current bb
in vt_initialize)?  In my earlier attempts to cache something in
var-tracking, I've always started caching at the end of vt_initialize,
when the VALUEs should be (with one small exception of
variable_post_merge_new_vals created VALUEs) pretty much stable, everything
should be flushed that needs to be flushed, etc.

Also, what effects (if any) does the patch have on the
.debug_info/.debug_loc size and coverage?

	Jakub

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fix for PR49888 var-tracking compile-time regression
  2013-01-16 10:58 ` Jakub Jelinek
@ 2013-01-16 13:25   ` Alexandre Oliva
  2013-01-16 14:15     ` Jakub Jelinek
  2013-01-16 16:55     ` Jakub Jelinek
  0 siblings, 2 replies; 7+ messages in thread
From: Alexandre Oliva @ 2013-01-16 13:25 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On Jan 16, 2013, Jakub Jelinek <jakub@redhat.com> wrote:

> On Wed, Jan 16, 2013 at 08:28:59AM -0200, Alexandre Oliva wrote:
>> PR49888 introduced clobber_overlapping_mems to detach VALUEs (and
>> variables bound to them) from MEMs as the MEMs are modified.  This
>> turned out to be quite expensive, particularly the computation of
>> canonical addresses passed to alias dependency.
>> 
>> This patch introduces caching of the canonical addresses to alleviate
>> this expensive operation.  We cache mappings that apply to the entire
>> function, from equivalences recorded in the global cselib table, and
>> mappings that apply only to the local basic block.  This cut down 2% of
>> a full regstrap cycle, and about 1% of the time it took to build target
>> libraries for C, C++ and Java.

> Can you safely cache the canon addresses already during vt_initialize
> (when cselib_* is still processing new insns, cselib VALUEs contain
> REGs and MEMs that are flushed at the end of processing the current bb
> in vt_initialize)?

No.  AFAICT we only call the address canonicalization function after
collecting all MOps, when the cselib table is fully constructed and
cleaned up from any local information, retaining only the global
equivalences.

> Also, what effects (if any) does the patch have on the
> .debug_info/.debug_loc size and coverage?

It shouldn't have any, since it's just caching results that would have
been recomputed over and over.  However, there's a possibility of being
“lucky” and recording an equivalence in the cache whose path would later
be removed from a dynamic set (say, if an incoming VALUE is reset and
re-bound within a block; I'm not sure this ever actually happens).  In
this case, these retained equivalences might enable alias analysis to
figure out that two memory refs do not overlap, and so one can be
retained in a dynamic equivalence list when we process a MOp that
modifies the other.  Or something ;-) It shouldn't really make any
difference, just speed things up a bit.  Paraphrasing Knuth, “I proved
it, but I didn't test it” ;-)

-- 
Alexandre Oliva, freedom fighter    http://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/   FSF Latin America board member
Free Software Evangelist      Red Hat Brazil Compiler Engineer

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fix for PR49888 var-tracking compile-time regression
  2013-01-16 13:25   ` Alexandre Oliva
@ 2013-01-16 14:15     ` Jakub Jelinek
  2013-01-16 16:55     ` Jakub Jelinek
  1 sibling, 0 replies; 7+ messages in thread
From: Jakub Jelinek @ 2013-01-16 14:15 UTC (permalink / raw)
  To: Alexandre Oliva; +Cc: gcc-patches

On Wed, Jan 16, 2013 at 11:25:46AM -0200, Alexandre Oliva wrote:
> > Can you safely cache the canon addresses already during vt_initialize
> > (when cselib_* is still processing new insns, cselib VALUEs contain
> > REGs and MEMs that are flushed at the end of processing the current bb
> > in vt_initialize)?
> 
> No.  AFAICT we only call the address canonicalization function after
> collecting all MOps, when the cselib table is fully constructed and
> cleaned up from any local information, retaining only the global
> equivalences.

Weird, I thought I've seen significant time spent in get_addr etc.
already during vt_initialize, e.g. when looking at PR54402, but I might be
wrong.
--- var-tracking.c.xx	2013-01-11 09:03:01.000000000 +0100
+++ var-tracking.c	2013-01-16 15:00:39.012478547 +0100
@@ -2172,11 +2172,14 @@ drop_overlapping_mem_locs (void **slot,
 
 /* Remove from SET all VALUE bindings to MEMs that overlap with LOC.  */
 
+static bool vt_initialize_p;
+
 static void
 clobber_overlapping_mems (dataflow_set *set, rtx loc)
 {
   struct overlapping_mems coms;
 
+gcc_assert (!vt_initialize_p);
   coms.set = set;
   coms.loc = canon_rtx (loc);
   coms.addr = vt_canonicalize_addr (set, XEXP (loc, 0));
@@ -9604,6 +9607,8 @@ vt_initialize (void)
       VTI (bb)->permp = NULL;
     }
 
+vt_initialize_p = true;
+
   if (MAY_HAVE_DEBUG_INSNS)
     {
       cselib_init (CSELIB_RECORD_MEMORY | CSELIB_PRESERVE_CONSTANTS);
@@ -9861,6 +9866,7 @@ vt_initialize (void)
 	}
     }
 
+vt_initialize_p = false;
   hard_frame_pointer_adjustment = -1;
   VTI (ENTRY_BLOCK_PTR)->flooded = true;
   cfa_base_rtx = NULL_RTX;
doesn't ICE on the few testcases I've tried.  If it only runs after
vt_initialize, my complain is wrong of course.

> > Also, what effects (if any) does the patch have on the
> > .debug_info/.debug_loc size and coverage?
> 
> It shouldn't have any, since it's just caching results that would have
> been recomputed over and over.  However, there's a possibility of being
> “lucky” and recording an equivalence in the cache whose path would later
> be removed from a dynamic set (say, if an incoming VALUE is reset and
> re-bound within a block; I'm not sure this ever actually happens).  In
> this case, these retained equivalences might enable alias analysis to
> figure out that two memory refs do not overlap, and so one can be
> retained in a dynamic equivalence list when we process a MOp that
> modifies the other.  Or something ;-) It shouldn't really make any
> difference, just speed things up a bit.  Paraphrasing Knuth, “I proved
> it, but I didn't test it” ;-)

Let me do a bootstrap/regtest pair (first one almost finished) to see.

	Jakub

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fix for PR49888 var-tracking compile-time regression
  2013-01-16 13:25   ` Alexandre Oliva
  2013-01-16 14:15     ` Jakub Jelinek
@ 2013-01-16 16:55     ` Jakub Jelinek
  2013-01-17 20:57       ` Alexandre Oliva
  1 sibling, 1 reply; 7+ messages in thread
From: Jakub Jelinek @ 2013-01-16 16:55 UTC (permalink / raw)
  To: Alexandre Oliva; +Cc: gcc-patches

On Wed, Jan 16, 2013 at 11:25:46AM -0200, Alexandre Oliva wrote:
> > Also, what effects (if any) does the patch have on the
> > .debug_info/.debug_loc size and coverage?
> 
> It shouldn't have any, since it's just caching results that would have
> been recomputed over and over.  However, there's a possibility of being
> “lucky” and recording an equivalence in the cache whose path would later
> be removed from a dynamic set (say, if an incoming VALUE is reset and
> re-bound within a block; I'm not sure this ever actually happens).  In
> this case, these retained equivalences might enable alias analysis to
> figure out that two memory refs do not overlap, and so one can be
> retained in a dynamic equivalence list when we process a MOp that
> modifies the other.  Or something ;-) It shouldn't really make any
> difference, just speed things up a bit.  Paraphrasing Knuth, “I proved
> it, but I didn't test it” ;-)

Ok, seems it is almost no change, but if I do between
--enable-languages=all,obj-c++,ada,lto,go --enable-checking=yes,rtl
x86_64-linux and i686-linux (the latter without ,ada) trees (once without
your var-tracking.c patch, once with) readelf -WS comparison of all the
gcc/*.o gcc/*/*.o files from stage3, I get differences beyond
var-tracking.o (which is of course expected):

for i686-linux tree-ssa-pre.o is different, and
for x86_64-linux go/export.o is different.
All other objects have the same readelf -WS output (not comparing .debug_*
section data, as that could be slightly different as I haven't done the
build with exactly the same object directory (different dirname, but same
length thereof)).  If I strip those two object files, then they are
identical between the two trees.  There are some differences in .debug_loc
in both cases.

	Jakub

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fix for PR49888 var-tracking compile-time regression
  2013-01-16 16:55     ` Jakub Jelinek
@ 2013-01-17 20:57       ` Alexandre Oliva
  2013-01-18 10:59         ` Jakub Jelinek
  0 siblings, 1 reply; 7+ messages in thread
From: Alexandre Oliva @ 2013-01-17 20:57 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2277 bytes --]

On Jan 16, 2013, Jakub Jelinek <jakub@redhat.com> wrote:

> for i686-linux tree-ssa-pre.o is different, and
> for x86_64-linux go/export.o is different.

I looked into the latter first, and that revealed a few major bugs:

1. I'd failed to fully implement what I'd planned.  The canonicalization
function would still take REGs from dataflow sets, instead of only
VALUEs that can be safely cached, and the memory clobbering was still
using an expression that could contain REGs rather than only VALUEs.
This means at times the canonical values could have been wrong, and they
could accidentally fail to match.

2. ENTRY_VALUEs, that could already be used as canonical base addresses
before, have now become far more likely to be chosen as such, but the
alias code compared all ENTRY_VALUEs of the same mode equal, because the
ENTRY_VALUE operand is a 0 rather than an e.

Having fixed these two problems, the debug info for go/export became
identical.  Feeling adventurous, I made all memory overlap tests
performed as part of the memory clobbering use (the now cheap)
canonicalized addresses, instead of only canonicalizing the compare-with
addresses that were marked as SP-based.

This gives us additional accuracy in the tests, and this caused some
additional differences in tree-ssa-pre compare on i686: with the earlier
patch, we'd fail to recognize many compares as non-overlapping, and
conservatively drop the corresponding MEMs from the dataflow set.  In
other cases, it was the use of VALUE-based address canonicalization that
enabled the mem conflict test to do a better job.

While investigating this, I found out we'd go TWICE over the entire
dataflow set clobbering each modified MEM, when once would have been
enough, and fixed that too.  This should further reduce the performance
impact caused by the original memory clobbering patch.

Here's the patch I've just regstrapped.  Test results look good, with no
regressions introduced since the earlier version of the patch, which
itself hadn't regressed on top of an earlier tree.  However, there are
regressions comparing the latest results with those based on the earlier
tree, so I'm now running a baseline test just to be sure that the
regressions are not caused by this patch.  Ok to install if they aren't?


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: vta-get-addr-caching-pr54402-pr54114.patch --]
[-- Type: text/x-diff, Size: 14382 bytes --]

Cache canonical addresses within VTA

From: Alexandre Oliva <aoliva@redhat.com>

for  gcc/ChangeLog

	PR debug/54114
	PR debug/54402
	PR debug/49888
	* var-tracking.c (negative_power_of_two_p): New.
	(global_get_addr_cache, local_get_addr_cache): New.
	(get_addr_from_global_cache, get_addr_from_local_cache): New.
	(vt_canonicalize_addr): Rewrite using the above.  Adjust the
	heading comment.
	(vt_stack_offset_p): Remove.
	(vt_canon_true_dep): Always canonicalize loc's address.
	(clobber_overlapping_mems): Make sure we have a MEM.
	(local_get_addr_clear_given_value): New.
	(val_reset): Clear local cached entries.
	(compute_bb_dataflow): Create and release the local cache.
	Disable duplicate MEMs clobbering.
	(emit_notes_in_bb): Clobber MEMs likewise.
	(vt_emit_notes): Create and release the local cache.
	(vt_initialize, vt_finalize): Create and release the global
	cache, respectively.
	* alias.c (rtx_equal_for_memref_p): Compare operands of
	ENTRY_VALUEs.
---

 gcc/alias.c        |    4 +
 gcc/var-tracking.c |  293 ++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 231 insertions(+), 66 deletions(-)


diff --git a/gcc/alias.c b/gcc/alias.c
index f3cd014..e18dd34 100644
--- a/gcc/alias.c
+++ b/gcc/alias.c
@@ -1465,6 +1465,10 @@ rtx_equal_for_memref_p (const_rtx x, const_rtx y)
     case SYMBOL_REF:
       return XSTR (x, 0) == XSTR (y, 0);
 
+    case ENTRY_VALUE:
+      /* This is magic, don't go through canonicalization et al.  */
+      return rtx_equal_p (ENTRY_VALUE_EXP (x), ENTRY_VALUE_EXP (y));
+
     case VALUE:
     CASE_CONST_UNIQUE:
       /* There's no need to compare the contents of CONST_DOUBLEs or
diff --git a/gcc/var-tracking.c b/gcc/var-tracking.c
index def624a..714acb69 100644
--- a/gcc/var-tracking.c
+++ b/gcc/var-tracking.c
@@ -1948,6 +1948,14 @@ var_regno_delete (dataflow_set *set, int regno)
   *reg = NULL;
 }
 
+/* Return true if I is the negated value of a power of two.  */
+static bool
+negative_power_of_two_p (HOST_WIDE_INT i)
+{
+  unsigned HOST_WIDE_INT x = -(unsigned HOST_WIDE_INT)i;
+  return x == (x & -x);
+}
+
 /* Strip constant offsets and alignments off of LOC.  Return the base
    expression.  */
 
@@ -1958,83 +1966,191 @@ vt_get_canonicalize_base (rtx loc)
 	  || GET_CODE (loc) == AND)
 	 && GET_CODE (XEXP (loc, 1)) == CONST_INT
 	 && (GET_CODE (loc) != AND
-	     || INTVAL (XEXP (loc, 1)) < 0))
+	     || negative_power_of_two_p (INTVAL (XEXP (loc, 1)))))
     loc = XEXP (loc, 0);
 
   return loc;
 }
 
+/* This caches canonicalized addresses for VALUEs, computed using
+   information in the global cselib table.  */
+static struct pointer_map_t *global_get_addr_cache;
+
+/* This caches canonicalized addresses for VALUEs, computed using
+   information from the global cache and information pertaining to a
+   basic block being analyzed.  */
+static struct pointer_map_t *local_get_addr_cache;
+
+static rtx vt_canonicalize_addr (dataflow_set *, rtx);
+
+/* Return the canonical address for LOC, that must be a VALUE, using a
+   cached global equivalence or computing it and storing it in the
+   global cache.  */
+
+static rtx
+get_addr_from_global_cache (rtx const loc)
+{
+  rtx x;
+  void **slot;
+
+  gcc_checking_assert (GET_CODE (loc) == VALUE);
+  
+  slot = pointer_map_insert (global_get_addr_cache, loc);
+  if (*slot)
+    return (rtx)*slot;
+
+  x = canon_rtx (get_addr (loc));
+
+  /* Tentative, avoiding infinite recursion.  */
+  *slot = x;
+
+  if (x != loc)
+    {
+      rtx nx = vt_canonicalize_addr (NULL, x);
+      if (nx != x)
+	{
+	  /* The table may have moved during recursion, recompute
+	     SLOT.  */
+	  slot = pointer_map_contains (global_get_addr_cache, loc);
+	  *slot = x = nx;
+	}
+    }
+
+  return x;
+}
+
+/* Return the canonical address for LOC, that must be a VALUE, using a
+   cached local equivalence or computing it and storing it in the
+   local cache.  */
+
+static rtx
+get_addr_from_local_cache (dataflow_set *set, rtx const loc)
+{
+  rtx x;
+  void **slot;
+  decl_or_value dv;
+  variable var;
+  location_chain l;
+
+  gcc_checking_assert (GET_CODE (loc) == VALUE);
+  
+  slot = pointer_map_insert (local_get_addr_cache, loc);
+  if (*slot)
+    return (rtx)*slot;
+
+  x = get_addr_from_global_cache (loc);
+  
+  /* Tentative, avoiding infinite recursion.  */
+  *slot = x;
+
+  /* Recurse to cache local expansion of X, or if we need to search
+     for a VALUE in the expansion.  */
+  if (x != loc)
+    {
+      rtx nx = vt_canonicalize_addr (set, x);
+      if (nx != x)
+	{
+	  slot = pointer_map_contains (local_get_addr_cache, loc);
+	  *slot = x = nx;
+	}
+      return x;
+    }
+
+  dv = dv_from_rtx (x);
+  var = (variable) htab_find_with_hash (shared_hash_htab (set->vars),
+					dv, dv_htab_hash (dv));
+  if (!var)
+    return x;
+
+  /* Look for an improved equivalent expression.  */
+  for (l = var->var_part[0].loc_chain; l; l = l->next)
+    {
+      rtx base = vt_get_canonicalize_base (l->loc);
+      if (GET_CODE (base) == VALUE
+	  && canon_value_cmp (base, loc))
+	{
+	  rtx nx = vt_canonicalize_addr (set, l->loc);
+	  if (x != nx)
+	    {
+	      slot = pointer_map_contains (local_get_addr_cache, loc);
+	      *slot = x = nx;
+	    }
+	  break;
+	}
+    }
+
+  return x;
+}
+
 /* Canonicalize LOC using equivalences from SET in addition to those
-   in the cselib static table.  */
+   in the cselib static table.  It expects a VALUE-based expression,
+   and it will only substitute VALUEs with other VALUEs or
+   function-global equivalences, so that, if two addresses have base
+   VALUEs that are locally or globally related in ways that
+   memrefs_conflict_p cares about, they will both canonicalize to
+   expressions that have the same base VALUE.
+
+   The use of VALUEs as canonical base addresses enables the canonical
+   RTXs to remain unchanged globally, if they resolve to a constant,
+   or throughout a basic block otherwise, so that they can be cached
+   and the cache needs not be invalidated when REGs, MEMs or such
+   change.  */
 
 static rtx
 vt_canonicalize_addr (dataflow_set *set, rtx oloc)
 {
   HOST_WIDE_INT ofst = 0;
   enum machine_mode mode = GET_MODE (oloc);
-  rtx loc = canon_rtx (get_addr (oloc));
+  rtx loc = oloc;
+  rtx x;
+  bool retry = true;
 
-  /* Try to substitute a base VALUE for equivalent expressions as much
-     as possible.  The goal here is to expand stack-related addresses
-     to one of the stack base registers, so that we can compare
-     addresses for overlaps.  */
-  while (GET_CODE (vt_get_canonicalize_base (loc)) == VALUE)
+  while (retry)
     {
-      rtx x;
-      decl_or_value dv;
-      variable var;
-      location_chain l;
-
-      while (GET_CODE (loc) == PLUS)
+      while (GET_CODE (loc) == PLUS
+	     && GET_CODE (XEXP (loc, 1)) == CONST_INT)
 	{
 	  ofst += INTVAL (XEXP (loc, 1));
 	  loc = XEXP (loc, 0);
-	  continue;
 	}
 
       /* Alignment operations can't normally be combined, so just
 	 canonicalize the base and we're done.  We'll normally have
 	 only one stack alignment anyway.  */
-      if (GET_CODE (loc) == AND)
+      if (GET_CODE (loc) == AND
+	  && GET_CODE (XEXP (loc, 1)) == CONST_INT
+	  && negative_power_of_two_p (INTVAL (XEXP (loc, 1))))
 	{
 	  x = vt_canonicalize_addr (set, XEXP (loc, 0));
 	  if (x != XEXP (loc, 0))
 	    loc = gen_rtx_AND (mode, x, XEXP (loc, 1));
-	  loc = canon_rtx (get_addr (loc));
-	  break;
+	  retry = false;
 	}
 
-      x = canon_rtx (get_addr (loc));
-
-      /* We've made progress!  Start over.  */
-      if (x != loc || GET_CODE (x) != VALUE)
+      if (GET_CODE (loc) == VALUE)
 	{
-	  loc = x;
-	  continue;
-	}
-
-      dv = dv_from_rtx (x);
-      var = (variable) htab_find_with_hash (shared_hash_htab (set->vars),
-					    dv, dv_htab_hash (dv));
-      if (!var)
-	break;
+	  if (set)
+	    loc = get_addr_from_local_cache (set, loc);
+	  else
+	    loc = get_addr_from_global_cache (loc);
 
-      /* Look for an improved equivalent expression.  */
-      for (l = var->var_part[0].loc_chain; l; l = l->next)
-	{
-	  rtx base = vt_get_canonicalize_base (l->loc);
-	  if (GET_CODE (base) == REG
-	      || (GET_CODE (base) == VALUE
-		  && canon_value_cmp (base, loc)))
+	  /* Consolidate plus_constants.  */
+	  while (ofst && GET_CODE (loc) == PLUS
+		 && GET_CODE (XEXP (loc, 1)) == CONST_INT)
 	    {
-	      loc = l->loc;
-	      break;
+	      ofst += INTVAL (XEXP (loc, 1));
+	      loc = XEXP (loc, 0);
 	    }
-	}
 
-      /* No luck with the dataflow set, so we're done.  */
-      if (!l)
-	break;
+	  retry = false;
+	}
+      else
+	{
+	  x = canon_rtx (loc);
+	  if (retry)
+	    retry = (x != loc);
+	  loc = x;
+	}
     }
 
   /* Add OFST back in.  */
@@ -2052,19 +2168,6 @@ vt_canonicalize_addr (dataflow_set *set, rtx oloc)
   return loc;
 }
 
-/* Return true iff ADDR has a stack register as the base address.  */
-
-static inline bool
-vt_stack_offset_p (rtx addr)
-{
-  rtx base = vt_get_canonicalize_base (addr);
-
-  if (GET_CODE (base) != REG)
-    return false;
-
-  return REGNO_PTR_FRAME_P (REGNO (base));
-}
-
 /* Return true iff there's a true dependence between MLOC and LOC.
    MADDR must be a canonicalized version of MLOC's address.  */
 
@@ -2074,15 +2177,10 @@ vt_canon_true_dep (dataflow_set *set, rtx mloc, rtx maddr, rtx loc)
   if (GET_CODE (loc) != MEM)
     return false;
 
-  if (!canon_true_dependence (mloc, GET_MODE (mloc), maddr, loc, NULL))
+  rtx addr = vt_canonicalize_addr (set, XEXP (loc, 0));
+  if (!canon_true_dependence (mloc, GET_MODE (mloc), maddr, loc, addr))
     return false;
 
-  if (!MEM_EXPR (loc) && vt_stack_offset_p (maddr))
-    {
-      rtx addr = vt_canonicalize_addr (set, XEXP (loc, 0));
-      return canon_true_dependence (mloc, GET_MODE (mloc), maddr, loc, addr);
-    }
-
   return true;
 }
 
@@ -2177,6 +2275,8 @@ clobber_overlapping_mems (dataflow_set *set, rtx loc)
 {
   struct overlapping_mems coms;
 
+  gcc_checking_assert (GET_CODE (loc) == MEM);
+
   coms.set = set;
   coms.loc = canon_rtx (loc);
   coms.addr = vt_canonicalize_addr (set, XEXP (loc, 0));
@@ -2358,6 +2458,17 @@ val_store (dataflow_set *set, rtx val, rtx loc, rtx insn, bool modified)
   val_bind (set, val, loc, modified);
 }
 
+/* Clear (canonical address) slots that reference X.  */
+
+static bool
+local_get_addr_clear_given_value (const void *v ATTRIBUTE_UNUSED,
+				  void **slot, void *x)
+{
+  if (vt_get_canonicalize_base ((rtx)*slot) == x)
+    *slot = NULL;
+  return true;
+}
+
 /* Reset this node, detaching all its equivalences.  Return the slot
    in the variable hash table that holds dv, if there is one.  */
 
@@ -2373,6 +2484,28 @@ val_reset (dataflow_set *set, decl_or_value dv)
 
   gcc_assert (var->n_var_parts == 1);
 
+  if (var->onepart == ONEPART_VALUE)
+    {
+      rtx x = dv_as_value (dv);
+      void **slot;
+      
+      /* Relationships in the global cache don't change, so reset the
+	 local cache entry only.  */
+      slot = pointer_map_contains (local_get_addr_cache, x);
+      if (slot)
+	{
+	  /* If the value resolved back to itself, odds are that other
+	     values may have cached it too.  These entries now refer
+	     to the old X, so detach them too.  Entries that used the
+	     old X but resolved to something else remain ok as long as
+	     that something else isn't also reset.  */
+	  if (*slot == x)
+	    pointer_map_traverse (local_get_addr_cache,
+				  local_get_addr_clear_given_value, x);
+	  *slot = NULL;
+	}
+    }
+
   cval = NULL;
   for (node = var->var_part[0].loc_chain; node; node = node->next)
     if (GET_CODE (node->loc) == VALUE
@@ -6424,6 +6557,9 @@ compute_bb_dataflow (basic_block bb)
   dataflow_set_copy (&old_out, out);
   dataflow_set_copy (out, in);
 
+  if (MAY_HAVE_DEBUG_INSNS)
+    local_get_addr_cache = pointer_map_create ();
+
   FOR_EACH_VEC_ELT (VTI (bb)->mos, i, mo)
     {
       rtx insn = mo->insn;
@@ -6613,7 +6749,12 @@ compute_bb_dataflow (basic_block bb)
 	      else if (REG_P (uloc))
 		var_regno_delete (out, REGNO (uloc));
 	      else if (MEM_P (uloc))
-		clobber_overlapping_mems (out, uloc);
+		{
+		  gcc_checking_assert (GET_CODE (vloc) == MEM);
+		  gcc_checking_assert (dstv == vloc);
+		  if (dstv != vloc)
+		    clobber_overlapping_mems (out, vloc);
+		}
 
 	      val_store (out, val, dstv, insn, true);
 	    }
@@ -6700,6 +6841,9 @@ compute_bb_dataflow (basic_block bb)
 
   if (MAY_HAVE_DEBUG_INSNS)
     {
+      pointer_map_destroy (local_get_addr_cache);
+      local_get_addr_cache = NULL;
+
       dataflow_set_equiv_regs (out);
       htab_traverse (shared_hash_htab (out->vars), canonicalize_values_mark,
 		     out);
@@ -9113,7 +9257,12 @@ emit_notes_in_bb (basic_block bb, dataflow_set *set)
 	      else if (REG_P (uloc))
 		var_regno_delete (set, REGNO (uloc));
 	      else if (MEM_P (uloc))
-		clobber_overlapping_mems (set, uloc);
+		{
+		  gcc_checking_assert (GET_CODE (vloc) == MEM);
+		  gcc_checking_assert (vloc == dstv);
+		  if (vloc != dstv)
+		    clobber_overlapping_mems (set, vloc);
+		}
 
 	      val_store (set, val, dstv, insn, true);
 
@@ -9240,9 +9389,16 @@ vt_emit_notes (void)
 	 subsequent basic blocks.  */
       emit_notes_for_differences (BB_HEAD (bb), &cur, &VTI (bb)->in);
 
+      if (MAY_HAVE_DEBUG_INSNS)
+	local_get_addr_cache = pointer_map_create ();
+
       /* Emit the notes for the changes in the basic block itself.  */
       emit_notes_in_bb (bb, &cur);
 
+      if (MAY_HAVE_DEBUG_INSNS)
+	pointer_map_destroy (local_get_addr_cache);
+      local_get_addr_cache = NULL;
+
       /* Free memory occupied by the in hash table, we won't need it
 	 again.  */
       dataflow_set_clear (&VTI (bb)->in);
@@ -9611,11 +9767,13 @@ vt_initialize (void)
       valvar_pool = create_alloc_pool ("small variable_def pool",
 				       sizeof (struct variable_def), 256);
       preserved_values.create (256);
+      global_get_addr_cache = pointer_map_create ();
     }
   else
     {
       scratch_regs = NULL;
       valvar_pool = NULL;
+      global_get_addr_cache = NULL;
     }
 
   if (MAY_HAVE_DEBUG_INSNS)
@@ -9952,6 +10110,9 @@ vt_finalize (void)
 
   if (MAY_HAVE_DEBUG_INSNS)
     {
+      if (global_get_addr_cache)
+	pointer_map_destroy (global_get_addr_cache);
+      global_get_addr_cache = NULL;
       if (loc_exp_dep_pool)
 	free_alloc_pool (loc_exp_dep_pool);
       loc_exp_dep_pool = NULL;

[-- Attachment #3: Type: text/plain, Size: 258 bytes --]



-- 
Alexandre Oliva, freedom fighter    http://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/   FSF Latin America board member
Free Software Evangelist      Red Hat Brazil Compiler Engineer

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fix for PR49888 var-tracking compile-time regression
  2013-01-17 20:57       ` Alexandre Oliva
@ 2013-01-18 10:59         ` Jakub Jelinek
  0 siblings, 0 replies; 7+ messages in thread
From: Jakub Jelinek @ 2013-01-18 10:59 UTC (permalink / raw)
  To: Alexandre Oliva; +Cc: gcc-patches

On Thu, Jan 17, 2013 at 06:56:40PM -0200, Alexandre Oliva wrote:
> From: Alexandre Oliva <aoliva@redhat.com>
> 
> for  gcc/ChangeLog
> 
> 	PR debug/54114
> 	PR debug/54402
> 	PR debug/49888
> 	* var-tracking.c (negative_power_of_two_p): New.
> 	(global_get_addr_cache, local_get_addr_cache): New.
> 	(get_addr_from_global_cache, get_addr_from_local_cache): New.
> 	(vt_canonicalize_addr): Rewrite using the above.  Adjust the
> 	heading comment.
> 	(vt_stack_offset_p): Remove.
> 	(vt_canon_true_dep): Always canonicalize loc's address.
> 	(clobber_overlapping_mems): Make sure we have a MEM.
> 	(local_get_addr_clear_given_value): New.
> 	(val_reset): Clear local cached entries.
> 	(compute_bb_dataflow): Create and release the local cache.
> 	Disable duplicate MEMs clobbering.
> 	(emit_notes_in_bb): Clobber MEMs likewise.
> 	(vt_emit_notes): Create and release the local cache.
> 	(vt_initialize, vt_finalize): Create and release the global
> 	cache, respectively.
> 	* alias.c (rtx_equal_for_memref_p): Compare operands of
> 	ENTRY_VALUEs.

Ok, thanks.

	Jakub

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2013-01-18 10:59 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-01-16 10:29 fix for PR49888 var-tracking compile-time regression Alexandre Oliva
2013-01-16 10:58 ` Jakub Jelinek
2013-01-16 13:25   ` Alexandre Oliva
2013-01-16 14:15     ` Jakub Jelinek
2013-01-16 16:55     ` Jakub Jelinek
2013-01-17 20:57       ` Alexandre Oliva
2013-01-18 10:59         ` Jakub Jelinek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).