public inbox for gcc-cvs@sourceware.org

[gcc r12-2097] tree-optimization/99728 - improve LIM for loops with aggregate copies
From: Richard Biener @ 2021-07-07 11:49 UTC
To: gcc-cvs

https://gcc.gnu.org/g:9f34b780b0461ec7b2b2defe96e44ab616ea2aa3

commit r12-2097-g9f34b780b0461ec7b2b2defe96e44ab616ea2aa3
Author: Richard Biener <rguenther@suse.de>
Date:   Wed Jul 7 11:41:03 2021 +0200

    tree-optimization/99728 - improve LIM for loops with aggregate copies
    
    This improves LIM by recording aggregate copies as distinct
    references used only for disambiguation purposes instead of as
    UNANALYZABLE_MEM, which would prevent any invariant or store
    motion across them.  This allows four of the six references in
    the loop of the testcase to be promoted.
    
    2021-07-07  Richard Biener  <rguenther@suse.de>
    
            PR tree-optimization/99728
            * tree-ssa-loop-im.c (gather_mem_refs_stmt): Record
            aggregate copies.
            (mem_refs_may_alias_p): Add assert we handled aggregate
            copies elsewhere.
            (sm_seq_valid_bb): Give up when running into aggregate copies.
            (ref_indep_loop_p): Handle aggregate copies as never
            being invariant themselves but allow other refs to be
            disambiguated against them.
            (can_sm_ref_p): Do not try to apply store-motion to aggregate
            copies.
    
            * g++.dg/opt/pr99728.C: New testcase.

Diff:
---
 gcc/testsuite/g++.dg/opt/pr99728.C | 50 ++++++++++++++++++++++++++++++++
 gcc/tree-ssa-loop-im.c             | 59 +++++++++++++++++++++++++++++++++-----
 2 files changed, 102 insertions(+), 7 deletions(-)

diff --git a/gcc/testsuite/g++.dg/opt/pr99728.C b/gcc/testsuite/g++.dg/opt/pr99728.C
new file mode 100644
index 00000000000..d4393231b4c
--- /dev/null
+++ b/gcc/testsuite/g++.dg/opt/pr99728.C
@@ -0,0 +1,50 @@
+// PR/99728
+// { dg-do compile }
+// { dg-options "-O2 -fdump-tree-lim2-details -w -Wno-psabi" }
+
+typedef double __m256d __attribute__((vector_size(sizeof (double) * 4)));
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_pd (double __A)
+{
+  return __extension__ (__m256d){ __A, __A, __A, __A };
+}
+
+// simple OO wrapper around __m256d
+struct Tvsimple
+  {
+  __m256d v;
+  Tvsimple &operator+=(const Tvsimple &other) {v+=other.v; return *this;}
+  Tvsimple operator*(double val) const { Tvsimple res; res.v = v*_mm256_set1_pd(val); return res;}
+  Tvsimple operator*(Tvsimple val) const { Tvsimple res; res.v = v*val.v; return res; }
+  Tvsimple operator+(Tvsimple val) const { Tvsimple res; res.v = v+val.v; return res; }
+  Tvsimple operator+(double val) const { Tvsimple res; res.v = v+_mm256_set1_pd(val); return res;}
+  };
+
+template<typename vtype> struct s0data_s
+  { vtype sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; };
+
+template<typename vtype> void foo(s0data_s<vtype> & __restrict__ d,
+  const double * __restrict__ coef, const double * __restrict__ alm,
+  unsigned long l, unsigned long il, unsigned long lmax)
+  {
+// critical loop
+  while (l<=lmax)
+    {
+    d.p1r += d.lam2*alm[2*l];
+    d.p1i += d.lam2*alm[2*l+1];
+    d.p2r += d.lam2*alm[2*l+2];
+    d.p2i += d.lam2*alm[2*l+3];
+    Tvsimple tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1;
+    d.lam1 = d.lam2;
+    d.lam2 = tmp;
+    ++il; l+=2;
+    }
+  }
+
+// this version has dead stores at the end of the loop
+template void foo<>(s0data_s<Tvsimple> & __restrict__ d,
+  const double * __restrict__ coef, const double * __restrict__ alm,
+  unsigned long l, unsigned long il, unsigned long lmax);
+
+// The aggregate copy in the IL should not prevent all store-motion
+// { dg-final { scan-tree-dump-times "Executing store motion" 4 "lim2" } }
diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c
index 9ac390b9a4b..81b4ec21d6e 100644
--- a/gcc/tree-ssa-loop-im.c
+++ b/gcc/tree-ssa-loop-im.c
@@ -122,7 +122,9 @@ public:
   hashval_t hash;		/* Its hash value.  */
 
   /* The memory access itself and associated caching of alias-oracle
-     query meta-data.  */
+     query meta-data.  We are using mem.ref == error_mark_node for the
+     case the reference is represented by its single access stmt
+     in accesses_in_loop[0].  */
   ao_ref mem;
 
   bitmap stored;		/* The set of loops in that this memory location
@@ -130,8 +132,7 @@ public:
   bitmap loaded;		/* The set of loops in that this memory location
 				   is loaded from.  */
   vec<mem_ref_loc>		accesses_in_loop;
-				/* The locations of the accesses.  Vector
-				   indexed by the loop number.  */
+				/* The locations of the accesses.  */
 
   /* The following set is computed on demand.  */
   bitmap_head dep_loop;		/* The set of loops in that the memory
@@ -1465,7 +1466,22 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt)
     return;
 
   mem = simple_mem_ref_in_stmt (stmt, &is_stored);
-  if (!mem)
+  if (!mem && is_gimple_assign (stmt))
+    {
+      /* For aggregate copies record distinct references but use them
+	 only for disambiguation purposes.  */
+      id = memory_accesses.refs_list.length ();
+      ref = mem_ref_alloc (NULL, 0, id);
+      memory_accesses.refs_list.safe_push (ref);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "Unhandled memory reference %u: ", id);
+	  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
+	}
+      record_mem_ref_loc (ref, stmt, mem);
+      is_stored = gimple_vdef (stmt);
+    }
+  else if (!mem)
     {
       /* We use the shared mem_ref for all unanalyzable refs.  */
       id = UNANALYZABLE_MEM_ID;
@@ -1595,7 +1611,8 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt)
       mark_ref_stored (ref, loop);
     }
   /* A not simple memory op is also a read when it is a write.  */
-  if (!is_stored || id == UNANALYZABLE_MEM_ID)
+  if (!is_stored || id == UNANALYZABLE_MEM_ID
+      || ref->mem.ref == error_mark_node)
     {
       bitmap_set_bit (&memory_accesses.refs_loaded_in_loop[loop->num], ref->id);
       mark_ref_loaded (ref, loop);
@@ -1714,6 +1731,9 @@ mem_refs_may_alias_p (im_mem_ref *mem1, im_mem_ref *mem2,
 		      hash_map<tree, name_expansion *> **ttae_cache,
 		      bool tbaa_p)
 {
+  gcc_checking_assert (mem1->mem.ref != error_mark_node
+		       && mem2->mem.ref != error_mark_node);
+
   /* Perform BASE + OFFSET analysis -- if MEM1 and MEM2 are based on the same
      object and their offset differ in such a way that the locations cannot
      overlap, then they cannot alias.  */
@@ -2490,6 +2510,13 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef,
       gcc_assert (data);
       if (data->ref == UNANALYZABLE_MEM_ID)
 	return -1;
+      /* Stop at memory references which we can't move.  */
+      else if (memory_accesses.refs_list[data->ref]->mem.ref == error_mark_node)
+	{
+	  /* Mark refs_not_in_seq as unsupported.  */
+	  bitmap_ior_into (refs_not_supported, refs_not_in_seq);
+	  return 1;
+	}
       /* One of the stores we want to apply SM to and we've not yet seen.  */
       else if (bitmap_clear_bit (refs_not_in_seq, data->ref))
 	{
@@ -2798,7 +2825,8 @@ ref_indep_loop_p (class loop *loop, im_mem_ref *ref, dep_kind kind)
   else
     refs_to_check = &memory_accesses.refs_stored_in_loop[loop->num];
 
-  if (bitmap_bit_p (refs_to_check, UNANALYZABLE_MEM_ID))
+  if (bitmap_bit_p (refs_to_check, UNANALYZABLE_MEM_ID)
+      || ref->mem.ref == error_mark_node)
     indep_p = false;
   else
     {
@@ -2825,7 +2853,20 @@ ref_indep_loop_p (class loop *loop, im_mem_ref *ref, dep_kind kind)
 	  EXECUTE_IF_SET_IN_BITMAP (refs_to_check, 0, i, bi)
 	    {
 	      im_mem_ref *aref = memory_accesses.refs_list[i];
-	      if (!refs_independent_p (ref, aref, kind != sm_waw))
+	      if (aref->mem.ref == error_mark_node)
+		{
+		  gimple *stmt = aref->accesses_in_loop[0].stmt;
+		  if ((kind == sm_war
+		       && ref_maybe_used_by_stmt_p (stmt, &ref->mem,
+						    kind != sm_waw))
+		      || stmt_may_clobber_ref_p_1 (stmt, &ref->mem,
+						   kind != sm_waw))
+		    {
+		      indep_p = false;
+		      break;
+		    }
+		}
+	      else if (!refs_independent_p (ref, aref, kind != sm_waw))
 		{
 		  indep_p = false;
 		  break;
@@ -2858,6 +2899,10 @@ can_sm_ref_p (class loop *loop, im_mem_ref *ref)
   if (!MEM_ANALYZABLE (ref))
     return false;
 
+  /* Can't hoist/sink aggregate copies.  */
+  if (ref->mem.ref == error_mark_node)
+    return false;
+
   /* It should be movable.  */
   if (!is_gimple_reg_type (TREE_TYPE (ref->mem.ref))
       || TREE_THIS_VOLATILE (ref->mem.ref)
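
To observe the effect outside the testsuite harness, the new testcase
can be compiled by hand with the options from its dg-options line and
the store-motion messages counted in the lim2 dump (the dump file's
pass number varies between GCC versions, hence the glob):

	g++ -O2 -w -Wno-psabi -fdump-tree-lim2-details -c pr99728.C
	grep -c "Executing store motion" pr99728.C.*.lim2

This should report 4, one per promoted reference, matching the
dg-final scan in the testcase.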

