public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-2097] tree-optimization/99728 - improve LIM for loops with aggregate copies
@ 2021-07-07 11:49 Richard Biener
0 siblings, 0 replies; only message in thread
From: Richard Biener @ 2021-07-07 11:49 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:9f34b780b0461ec7b2b2defe96e44ab616ea2aa3
commit r12-2097-g9f34b780b0461ec7b2b2defe96e44ab616ea2aa3
Author: Richard Biener <rguenther@suse.de>
Date: Wed Jul 7 11:41:03 2021 +0200
tree-optimization/99728 - improve LIM for loops with aggregate copies
This improves LIM by recording aggregate copies for disambiguation
purposes instead of treating them as UNANALYZABLE_MEM, which prevents
any invariant or store motion across them. This allows four of the six
references in the loop of the testcase to be promoted.
2021-07-07 Richard Biener <rguenther@suse.de>
PR tree-optimization/99728
* tree-ssa-loop-im.c (gather_mem_refs_stmt): Record
aggregate copies.
(mem_refs_may_alias_p): Add assert we handled aggregate
copies elsewhere.
(sm_seq_valid_bb): Give up when running into aggregate copies.
(ref_indep_loop_p): Handle aggregate copies as never
being invariant themselves but allow other refs to be
disambiguated against them.
(can_sm_ref_p): Do not try to apply store-motion to aggregate
copies.
* g++.dg/opt/pr99728.C: New testcase.
Diff:
---
gcc/testsuite/g++.dg/opt/pr99728.C | 50 ++++++++++++++++++++++++++++++++
gcc/tree-ssa-loop-im.c | 59 +++++++++++++++++++++++++++++++++-----
2 files changed, 102 insertions(+), 7 deletions(-)
diff --git a/gcc/testsuite/g++.dg/opt/pr99728.C b/gcc/testsuite/g++.dg/opt/pr99728.C
new file mode 100644
index 00000000000..d4393231b4c
--- /dev/null
+++ b/gcc/testsuite/g++.dg/opt/pr99728.C
@@ -0,0 +1,50 @@
+// PR/99728
+// { dg-do compile }
+// { dg-options "-O2 -fdump-tree-lim2-details -w -Wno-psabi" }
+
+typedef double __m256d __attribute__((vector_size(sizeof (double) * 4)));
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_pd (double __A)
+{
+ return __extension__ (__m256d){ __A, __A, __A, __A };
+}
+
+// simple OO wrapper around __m256d
+struct Tvsimple
+ {
+ __m256d v;
+ Tvsimple &operator+=(const Tvsimple &other) {v+=other.v; return *this;}
+ Tvsimple operator*(double val) const { Tvsimple res; res.v = v*_mm256_set1_pd(val); return res;}
+ Tvsimple operator*(Tvsimple val) const { Tvsimple res; res.v = v*val.v; return res; }
+ Tvsimple operator+(Tvsimple val) const { Tvsimple res; res.v = v+val.v; return res; }
+ Tvsimple operator+(double val) const { Tvsimple res; res.v = v+_mm256_set1_pd(val); return res;}
+ };
+
+template<typename vtype> struct s0data_s
+ { vtype sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; };
+
+template<typename vtype> void foo(s0data_s<vtype> & __restrict__ d,
+ const double * __restrict__ coef, const double * __restrict__ alm,
+ unsigned long l, unsigned long il, unsigned long lmax)
+ {
+// critical loop
+ while (l<=lmax)
+ {
+ d.p1r += d.lam2*alm[2*l];
+ d.p1i += d.lam2*alm[2*l+1];
+ d.p2r += d.lam2*alm[2*l+2];
+ d.p2i += d.lam2*alm[2*l+3];
+ Tvsimple tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1;
+ d.lam1 = d.lam2;
+ d.lam2 = tmp;
+ ++il; l+=2;
+ }
+ }
+
+// this version has dead stores at the end of the loop
+template void foo<>(s0data_s<Tvsimple> & __restrict__ d,
+ const double * __restrict__ coef, const double * __restrict__ alm,
+ unsigned long l, unsigned long il, unsigned long lmax);
+
+// The aggregate copy in the IL should not prevent all store-motion
+// { dg-final { scan-tree-dump-times "Executing store motion" 4 "lim2" } }
diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c
index 9ac390b9a4b..81b4ec21d6e 100644
--- a/gcc/tree-ssa-loop-im.c
+++ b/gcc/tree-ssa-loop-im.c
@@ -122,7 +122,9 @@ public:
hashval_t hash; /* Its hash value. */
/* The memory access itself and associated caching of alias-oracle
- query meta-data. */
+ query meta-data. We are using mem.ref == error_mark_node for the
+ case the reference is represented by its single access stmt
+ in accesses_in_loop[0]. */
ao_ref mem;
bitmap stored; /* The set of loops in that this memory location
@@ -130,8 +132,7 @@ public:
bitmap loaded; /* The set of loops in that this memory location
is loaded from. */
vec<mem_ref_loc> accesses_in_loop;
- /* The locations of the accesses. Vector
- indexed by the loop number. */
+ /* The locations of the accesses. */
/* The following set is computed on demand. */
bitmap_head dep_loop; /* The set of loops in that the memory
@@ -1465,7 +1466,22 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt)
return;
mem = simple_mem_ref_in_stmt (stmt, &is_stored);
- if (!mem)
+ if (!mem && is_gimple_assign (stmt))
+ {
+ /* For aggregate copies record distinct references but use them
+ only for disambiguation purposes. */
+ id = memory_accesses.refs_list.length ();
+ ref = mem_ref_alloc (NULL, 0, id);
+ memory_accesses.refs_list.safe_push (ref);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "Unhandled memory reference %u: ", id);
+ print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
+ }
+ record_mem_ref_loc (ref, stmt, mem);
+ is_stored = gimple_vdef (stmt);
+ }
+ else if (!mem)
{
/* We use the shared mem_ref for all unanalyzable refs. */
id = UNANALYZABLE_MEM_ID;
@@ -1595,7 +1611,8 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt)
mark_ref_stored (ref, loop);
}
/* A not simple memory op is also a read when it is a write. */
- if (!is_stored || id == UNANALYZABLE_MEM_ID)
+ if (!is_stored || id == UNANALYZABLE_MEM_ID
+ || ref->mem.ref == error_mark_node)
{
bitmap_set_bit (&memory_accesses.refs_loaded_in_loop[loop->num], ref->id);
mark_ref_loaded (ref, loop);
@@ -1714,6 +1731,9 @@ mem_refs_may_alias_p (im_mem_ref *mem1, im_mem_ref *mem2,
hash_map<tree, name_expansion *> **ttae_cache,
bool tbaa_p)
{
+ gcc_checking_assert (mem1->mem.ref != error_mark_node
+ && mem2->mem.ref != error_mark_node);
+
/* Perform BASE + OFFSET analysis -- if MEM1 and MEM2 are based on the same
object and their offset differ in such a way that the locations cannot
overlap, then they cannot alias. */
@@ -2490,6 +2510,13 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef,
gcc_assert (data);
if (data->ref == UNANALYZABLE_MEM_ID)
return -1;
+ /* Stop at memory references which we can't move. */
+ else if (memory_accesses.refs_list[data->ref]->mem.ref == error_mark_node)
+ {
+ /* Mark refs_not_in_seq as unsupported. */
+ bitmap_ior_into (refs_not_supported, refs_not_in_seq);
+ return 1;
+ }
/* One of the stores we want to apply SM to and we've not yet seen. */
else if (bitmap_clear_bit (refs_not_in_seq, data->ref))
{
@@ -2798,7 +2825,8 @@ ref_indep_loop_p (class loop *loop, im_mem_ref *ref, dep_kind kind)
else
refs_to_check = &memory_accesses.refs_stored_in_loop[loop->num];
- if (bitmap_bit_p (refs_to_check, UNANALYZABLE_MEM_ID))
+ if (bitmap_bit_p (refs_to_check, UNANALYZABLE_MEM_ID)
+ || ref->mem.ref == error_mark_node)
indep_p = false;
else
{
@@ -2825,7 +2853,20 @@ ref_indep_loop_p (class loop *loop, im_mem_ref *ref, dep_kind kind)
EXECUTE_IF_SET_IN_BITMAP (refs_to_check, 0, i, bi)
{
im_mem_ref *aref = memory_accesses.refs_list[i];
- if (!refs_independent_p (ref, aref, kind != sm_waw))
+ if (aref->mem.ref == error_mark_node)
+ {
+ gimple *stmt = aref->accesses_in_loop[0].stmt;
+ if ((kind == sm_war
+ && ref_maybe_used_by_stmt_p (stmt, &ref->mem,
+ kind != sm_waw))
+ || stmt_may_clobber_ref_p_1 (stmt, &ref->mem,
+ kind != sm_waw))
+ {
+ indep_p = false;
+ break;
+ }
+ }
+ else if (!refs_independent_p (ref, aref, kind != sm_waw))
{
indep_p = false;
break;
@@ -2858,6 +2899,10 @@ can_sm_ref_p (class loop *loop, im_mem_ref *ref)
if (!MEM_ANALYZABLE (ref))
return false;
+ /* Can't hoist/sink aggregate copies. */
+ if (ref->mem.ref == error_mark_node)
+ return false;
+
/* It should be movable. */
if (!is_gimple_reg_type (TREE_TYPE (ref->mem.ref))
|| TREE_THIS_VOLATILE (ref->mem.ref)
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2021-07-07 11:49 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-07 11:49 [gcc r12-2097] tree-optimization/99728 - improve LIM for loops with aggregate copies Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).