public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Fix PR84859
@ 2018-03-16 14:09 Richard Biener
  2018-03-19 12:12 ` Jakub Jelinek
  0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2018-03-16 14:09 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jakub Jelinek


This fixes a bogus warning by performing MIN_EXPR detection on
memory which avoids bogus threading and isolating of a path with
a bogus memcpy.

I've gone back in time and at some point we had no MAX_STORES_TO_SINK
param but handled just the last_and_only_stmt case of sinking
two stores.  The param got added when the code was extended to handle
an arbitrary number of stores and thus did dependence checking which
eventually was thought to be too expensive to do without vectorization
or if-conversion enabled.

So this restores this ability even when MAX_STORES_TO_SINK is zero
and also generalizes it a bit to not only handle last_and_only_stmt
but the case where each BB has a single store without any following
loads.  In that case we can elide the dependence checking and perform
the desired transform (for the testcase one BB has another stmt
so last_and_only_stmt wouldn't have worked).

I've had to disable cselim on three testcases, one being a unit
test for PRE and two being unit tests (?!) for my very special
friend path splitting.

Bootstrapped and tested on x86_64-unknown-linux-gnu, ok?

Thanks,
Richard.

2018-03-16  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/84859
	* tree-ssa-phiopt.c (single_trailing_store_in_bb): New function.
	(cond_if_else_store_replacement): Perform sinking operation on
	single-store BBs regardless of MAX_STORES_TO_SINK setting.
	Generalize what a BB with a single eligible store is.

	* gcc.dg/tree-ssa/pr84859.c: New testcase.
	* gcc.dg/tree-ssa/pr35286.c: Disable cselim.
	* gcc.dg/tree-ssa/split-path-6.c: Likewise.
	* gcc.dg/tree-ssa/split-path-7.c: Likewise.

Index: gcc/tree-ssa-phiopt.c
===================================================================
--- gcc/tree-ssa-phiopt.c	(revision 258584)
+++ gcc/tree-ssa-phiopt.c	(working copy)
@@ -2035,6 +2035,36 @@ cond_if_else_store_replacement_1 (basic_
   return true;
 }
 
+/* Return the single store in BB with VDEF or NULL if there are
+   other stores in the BB or loads following the store.  */
+
+static gimple *
+single_trailing_store_in_bb (basic_block bb, tree vdef)
+{
+  if (SSA_NAME_IS_DEFAULT_DEF (vdef))
+    return NULL;
+  gimple *store = SSA_NAME_DEF_STMT (vdef);
+  if (gimple_bb (store) != bb
+      || gimple_code (store) == GIMPLE_PHI)
+    return NULL;
+
+  /* Verify there is no other store in this BB.  */
+  if (!SSA_NAME_IS_DEFAULT_DEF (gimple_vuse (store))
+      && gimple_bb (SSA_NAME_DEF_STMT (gimple_vuse (store))) == bb
+      && gimple_code (SSA_NAME_DEF_STMT (gimple_vuse (store))) != GIMPLE_PHI)
+    return NULL;
+
+  /* Verify there is no load or store after the store.  */
+  use_operand_p use_p;
+  imm_use_iterator imm_iter;
+  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_vdef (store))
+    if (USE_STMT (use_p) != store
+	&& gimple_bb (USE_STMT (use_p)) == bb)
+      return NULL;
+
+  return store;
+}
+
 /* Conditional store replacement.  We already know
    that the recognized pattern looks like so:
 
@@ -2061,8 +2091,6 @@ static bool
 cond_if_else_store_replacement (basic_block then_bb, basic_block else_bb,
                                 basic_block join_bb)
 {
-  gimple *then_assign = last_and_only_stmt (then_bb);
-  gimple *else_assign = last_and_only_stmt (else_bb);
   vec<data_reference_p> then_datarefs, else_datarefs;
   vec<ddr_p> then_ddrs, else_ddrs;
   gimple *then_store, *else_store;
@@ -2073,13 +2101,32 @@ cond_if_else_store_replacement (basic_bl
   tree then_lhs, else_lhs;
   basic_block blocks[3];
 
-  if (MAX_STORES_TO_SINK == 0)
+  /* Handle the case with single store in THEN_BB and ELSE_BB.  That is
+     cheap enough to always handle as it allows us to elide dependence
+     checking.  */
+  gphi *vphi = NULL;
+  for (gphi_iterator si = gsi_start_phis (join_bb); !gsi_end_p (si);
+       gsi_next (&si))
+    if (virtual_operand_p (gimple_phi_result (si.phi ())))
+      {
+	vphi = si.phi ();
+	break;
+      }
+  if (!vphi)
     return false;
+  tree then_vdef = PHI_ARG_DEF_FROM_EDGE (vphi, single_succ_edge (then_bb));
+  tree else_vdef = PHI_ARG_DEF_FROM_EDGE (vphi, single_succ_edge (else_bb));
+  gimple *then_assign = single_trailing_store_in_bb (then_bb, then_vdef);
+  if (then_assign)
+    {
+      gimple *else_assign = single_trailing_store_in_bb (else_bb, else_vdef);
+      if (else_assign)
+	return cond_if_else_store_replacement_1 (then_bb, else_bb, join_bb,
+						 then_assign, else_assign);
+    }
 
-  /* Handle the case with single statement in THEN_BB and ELSE_BB.  */
-  if (then_assign && else_assign)
-    return cond_if_else_store_replacement_1 (then_bb, else_bb, join_bb,
-                                             then_assign, else_assign);
+  if (MAX_STORES_TO_SINK == 0)
+    return false;
 
   /* Find data references.  */
   then_datarefs.create (1);
Index: gcc/testsuite/gcc.dg/tree-ssa/pr35286.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr35286.c	(revision 258584)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr35286.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */ 
-/* { dg-options "-O2 -fno-code-hoisting -fdump-tree-pre-stats" } */
+/* { dg-options "-O2 -fno-code-hoisting -fno-tree-cselim -fdump-tree-pre-stats" } */
 int g2;
 struct A {
     int a; int b;
Index: gcc/testsuite/gcc.dg/tree-ssa/split-path-6.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/split-path-6.c	(revision 258584)
+++ gcc/testsuite/gcc.dg/tree-ssa/split-path-6.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -fsplit-paths -fdump-tree-split-paths-details -w" } */
+/* { dg-options "-O2 -fsplit-paths -fno-tree-cselim -fdump-tree-split-paths-details -w" } */
 
 struct __sFILE
 {
Index: gcc/testsuite/gcc.dg/tree-ssa/split-path-7.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/split-path-7.c	(revision 258584)
+++ gcc/testsuite/gcc.dg/tree-ssa/split-path-7.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -fsplit-paths -fdump-tree-split-paths-details -w" } */
+/* { dg-options "-O2 -fsplit-paths -fno-tree-cselim -fdump-tree-split-paths-details -w" } */
 
 
 struct _reent
Index: gcc/testsuite/gcc.dg/tree-ssa/pr84859.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr84859.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr84859.c	(working copy)
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -Warray-bounds -fdump-tree-phiopt1" } */
+
+void
+h (const void *p, unsigned n)
+{
+  unsigned char a[8];
+  if (n > sizeof a)
+    return;
+
+  for (; n > 0; n -= *a)
+    {
+      if (n > 255)
+	*a = 255;
+      else
+	*a = n;
+
+      __builtin_memcpy (a, p, *a);   /* { dg-bogus "bounds" } */
+    }
+}
+
+/* { dg-final { scan-tree-dump "MIN_EXPR" "phiopt1" } } */

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Fix PR84859
  2018-03-16 14:09 [PATCH] Fix PR84859 Richard Biener
@ 2018-03-19 12:12 ` Jakub Jelinek
  2018-03-20 12:07   ` Bin.Cheng
  0 siblings, 1 reply; 7+ messages in thread
From: Jakub Jelinek @ 2018-03-19 12:12 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches

On Fri, Mar 16, 2018 at 03:08:17PM +0100, Richard Biener wrote:
> Bootstrapped and tested on x86_64-unknown-linux-gnu, ok?

Ok, thanks.

	Jakub

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Fix PR84859
  2018-03-19 12:12 ` Jakub Jelinek
@ 2018-03-20 12:07   ` Bin.Cheng
  2018-03-20 12:08     ` Richard Biener
  0 siblings, 1 reply; 7+ messages in thread
From: Bin.Cheng @ 2018-03-20 12:07 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Biener, gcc-patches List

On Mon, Mar 19, 2018 at 11:57 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Fri, Mar 16, 2018 at 03:08:17PM +0100, Richard Biener wrote:
>> Bootstrapped and tested on x86_64-unknown-linux-gnu, ok?
>
> Ok, thanks.
Hi Richard,

This change causes below test failure on aarch64/arm linux:
FAIL: libgomp.graphite/force-parallel-4.c scan-tree-dump-times
graphite "2 loops carried no dependency" 1

Thanks,
bin
>
>         Jakub

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Fix PR84859
  2018-03-20 12:07   ` Bin.Cheng
@ 2018-03-20 12:08     ` Richard Biener
  2018-03-20 12:44       ` Richard Biener
  0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2018-03-20 12:08 UTC (permalink / raw)
  To: Bin.Cheng; +Cc: Jakub Jelinek, gcc-patches List, sebpop

On Tue, 20 Mar 2018, Bin.Cheng wrote:

> On Mon, Mar 19, 2018 at 11:57 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> > On Fri, Mar 16, 2018 at 03:08:17PM +0100, Richard Biener wrote:
> >> Bootstrapped and tested on x86_64-unknown-linux-gnu, ok?
> >
> > Ok, thanks.
> Hi Richard,
> 
> This change causes below test failure on aarch64/arm linux:
> FAIL: libgomp.graphite/force-parallel-4.c scan-tree-dump-times
> graphite "2 loops carried no dependency" 1

So it looks like the conditional stores to A[i] in

  for (i = 0; i < N; i++)
    {
      if (i < T)
        /* loop i1: carried no dependency.  */
        A[i] = A[i+T];
      else
        /* loop i2: carried dependency.  */
        A[i] = A[i+T+1];
    }

vs. the stores commoned in the merge block causes the merge
block to appear in the schedule and thus

 [scheduler] original ast:
 {
   for (int c0 = 0; c0 <= 19999; c0 += 1)
     S_3(c0);
-  for (int c0 = 0; c0 <= 999; c0 += 1)
-    S_5(c0);
-  for (int c0 = 1000; c0 <= 9999; c0 += 1)
-    S_6(c0);
+  for (int c0 = 0; c0 <= 9999; c0 += 1) {
+    if (c0 >= 1000) {
+      S_6(c0);
+    } else {
+      S_5(c0);
+    }
+    S_7(c0);
+  }
 }

 [scheduler] AST generated by isl:
 {
   for (int c0 = 0; c0 <= 19999; c0 += 1)
     S_3(c0);
-  for (int c0 = 0; c0 <= 999; c0 += 1)
+  for (int c0 = 0; c0 <= 999; c0 += 1) {
     S_5(c0);
-  for (int c0 = 1000; c0 <= 9999; c0 += 1)
+    S_7(c0);
+  }
+  for (int c0 = 1000; c0 <= 9999; c0 += 1) {
     S_6(c0);
+    S_7(c0);
+  }
 }

not sure why neither of the two loops end up parallelizable then.
Possibly because of the dependences we generate for the cross-BB
scalar deps which store and load to a "scalar" memory location
(in the end those are just SSA dependences).  So it's an artificial
limitation caused by how we represent scheduling dependences
I guess:

+Adding scalar write: _4
+From stmt: _4 = A[_3];
+Adding scalar write: cstore_13
+From stmt: cstore_13 = PHI <_2(5), _4(6)>
...
+Adding scalar write: _2
+From stmt: _2 = A[_1];
+Adding scalar write: cstore_13
+From stmt: cstore_13 = PHI <_2(5), _4(6)>
...
+Adding scalar read: cstore_13
+From stmt: cstore_13 = PHI <_2(5), _4(6)>

I have no idea/plan to improve that on the GRAPHITE/ISL side right now
but maybe Sebastian has?  We'd need to tell ISL that we can "elide"
the dependence or rather that it is inner-iteration only and doesn't
leak cross-iteration - kind of clobbering it at the end?

So I'm going to XFAIL the testcase.

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Fix PR84859
  2018-03-20 12:08     ` Richard Biener
@ 2018-03-20 12:44       ` Richard Biener
  2018-03-20 14:37         ` Jakub Jelinek
  0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2018-03-20 12:44 UTC (permalink / raw)
  To: Bin.Cheng; +Cc: Jakub Jelinek, gcc-patches List

On Tue, 20 Mar 2018, Richard Biener wrote:

> On Tue, 20 Mar 2018, Bin.Cheng wrote:
> 
> > On Mon, Mar 19, 2018 at 11:57 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> > > On Fri, Mar 16, 2018 at 03:08:17PM +0100, Richard Biener wrote:
> > >> Bootstrapped and tested on x86_64-unknown-linux-gnu, ok?
> > >
> > > Ok, thanks.
> > Hi Richard,
> > 
> > This change causes below test failure on aarch64/arm linux:
> > FAIL: libgomp.graphite/force-parallel-4.c scan-tree-dump-times
> > graphite "2 loops carried no dependency" 1
> 
> So it looks like the conditional stores to A[i] in
> 
>   for (i = 0; i < N; i++)
>     {
>       if (i < T)
>         /* loop i1: carried no dependency.  */
>         A[i] = A[i+T];
>       else
>         /* loop i2: carried dependency.  */
>         A[i] = A[i+T+1];
>     }
> 
> vs. the stores commoned in the merge block causes the merge
> block to appear in the schedule and thus
> 
>  [scheduler] original ast:
>  {
>    for (int c0 = 0; c0 <= 19999; c0 += 1)
>      S_3(c0);
> -  for (int c0 = 0; c0 <= 999; c0 += 1)
> -    S_5(c0);
> -  for (int c0 = 1000; c0 <= 9999; c0 += 1)
> -    S_6(c0);
> +  for (int c0 = 0; c0 <= 9999; c0 += 1) {
> +    if (c0 >= 1000) {
> +      S_6(c0);
> +    } else {
> +      S_5(c0);
> +    }
> +    S_7(c0);
> +  }
>  }
> 
>  [scheduler] AST generated by isl:
>  {
>    for (int c0 = 0; c0 <= 19999; c0 += 1)
>      S_3(c0);
> -  for (int c0 = 0; c0 <= 999; c0 += 1)
> +  for (int c0 = 0; c0 <= 999; c0 += 1) {
>      S_5(c0);
> -  for (int c0 = 1000; c0 <= 9999; c0 += 1)
> +    S_7(c0);
> +  }
> +  for (int c0 = 1000; c0 <= 9999; c0 += 1) {
>      S_6(c0);
> +    S_7(c0);
> +  }
>  }
> 
> not sure why neither of the two loops end up parallelizable then.
> Possibly because of the dependences we generate for the cross-BB
> scalar deps which store and load to a "scalar" memory location
> (in the end those are just SSA dependences).  So it's an artificial
> limitation caused by how we represent scheduling dependences
> I guess:
> 
> +Adding scalar write: _4
> +From stmt: _4 = A[_3];
> +Adding scalar write: cstore_13
> +From stmt: cstore_13 = PHI <_2(5), _4(6)>
> ...
> +Adding scalar write: _2
> +From stmt: _2 = A[_1];
> +Adding scalar write: cstore_13
> +From stmt: cstore_13 = PHI <_2(5), _4(6)>
> ...
> +Adding scalar read: cstore_13
> +From stmt: cstore_13 = PHI <_2(5), _4(6)>
> 
> I have no idea/plan to improve that on the GRAPHITE/ISL side right now
> but maybe Sebastian has?  We'd need to tell ISL that we can "elide"
> the dependence or rather that it is inner-iteration only and doesn't
> leak cross-iteration - kind of clobbering it at the end?
> 
> So I'm going to XFAIL the testcase.

2018-03-20  Richard Biener  <rguenther@suse.de>

	* testsuite/libgomp.graphite/force-parallel-4.c: XFAIL one
	parallelizable loop.

Index: libgomp/testsuite/libgomp.graphite/force-parallel-4.c
===================================================================
--- libgomp/testsuite/libgomp.graphite/force-parallel-4.c	(revision 258678)
+++ libgomp/testsuite/libgomp.graphite/force-parallel-4.c	(working copy)
@@ -46,7 +46,10 @@ int main(void)
   return 0;
 }
 
-/* Check that parallel code generation part make the right answer.  */
-/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" } } */
+/* Check that parallel code generation part make the right answer.
+   ???  XFAILed for i1 because conditional store elimination wrecks
+   our dependence representation.  */
+/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "1 loops carried no dependency" 1 "graphite" } } */
 /* { dg-final { scan-tree-dump-times "loopfn.0" 4 "optimized" } } */
 /* { dg-final { scan-tree-dump-times "loopfn.1" 4 "optimized" } } */

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Fix PR84859
  2018-03-20 14:37         ` Jakub Jelinek
@ 2018-03-20 13:02           ` Richard Biener
  0 siblings, 0 replies; 7+ messages in thread
From: Richard Biener @ 2018-03-20 13:02 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Bin.Cheng, gcc-patches List

On Tue, 20 Mar 2018, Jakub Jelinek wrote:

> On Tue, Mar 20, 2018 at 01:42:04PM +0100, Richard Biener wrote:
> > 2018-03-20  Richard Biener  <rguenther@suse.de>
> > 
> > 	* testsuite/libgomp.graphite/force-parallel-4.c: XFAIL one
> > 	parallelizable loop.
> > 
> > Index: libgomp/testsuite/libgomp.graphite/force-parallel-4.c
> > ===================================================================
> > --- libgomp/testsuite/libgomp.graphite/force-parallel-4.c	(revision 258678)
> > +++ libgomp/testsuite/libgomp.graphite/force-parallel-4.c	(working copy)
> > @@ -46,7 +46,10 @@ int main(void)
> >    return 0;
> >  }
> >  
> > -/* Check that parallel code generation part make the right answer.  */
> > -/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" } } */
> > +/* Check that parallel code generation part make the right answer.
> > +   ???  XFAILed for i1 because conditional store elimination wrecks
> > +   our dependence representation.  */
> > +/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" { xfail *-*-* } } } */
> > +/* { dg-final { scan-tree-dump-times "1 loops carried no dependency" 1 "graphite" } } */
> 
> Shouldn't this line be then "\[12] loops carried no dependency" 1 "graphite" } } */
> so that when the previous starts XPASSing, we don't actually get a new FAIL?

I think we want to know this and adjust the testcase then to not go
back to 1 loop with no dependence.

> >  /* { dg-final { scan-tree-dump-times "loopfn.0" 4 "optimized" } } */
> >  /* { dg-final { scan-tree-dump-times "loopfn.1" 4 "optimized" } } */

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Fix PR84859
  2018-03-20 12:44       ` Richard Biener
@ 2018-03-20 14:37         ` Jakub Jelinek
  2018-03-20 13:02           ` Richard Biener
  0 siblings, 1 reply; 7+ messages in thread
From: Jakub Jelinek @ 2018-03-20 14:37 UTC (permalink / raw)
  To: Richard Biener; +Cc: Bin.Cheng, gcc-patches List

On Tue, Mar 20, 2018 at 01:42:04PM +0100, Richard Biener wrote:
> 2018-03-20  Richard Biener  <rguenther@suse.de>
> 
> 	* testsuite/libgomp.graphite/force-parallel-4.c: XFAIL one
> 	parallelizable loop.
> 
> Index: libgomp/testsuite/libgomp.graphite/force-parallel-4.c
> ===================================================================
> --- libgomp/testsuite/libgomp.graphite/force-parallel-4.c	(revision 258678)
> +++ libgomp/testsuite/libgomp.graphite/force-parallel-4.c	(working copy)
> @@ -46,7 +46,10 @@ int main(void)
>    return 0;
>  }
>  
> -/* Check that parallel code generation part make the right answer.  */
> -/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" } } */
> +/* Check that parallel code generation part make the right answer.
> +   ???  XFAILed for i1 because conditional store elimination wrecks
> +   our dependence representation.  */
> +/* { dg-final { scan-tree-dump-times "2 loops carried no dependency" 1 "graphite" { xfail *-*-* } } } */
> +/* { dg-final { scan-tree-dump-times "1 loops carried no dependency" 1 "graphite" } } */

Shouldn't this line be then "\[12] loops carried no dependency" 1 "graphite" } } */
so that when the previous starts XPASSing, we don't actually get a new FAIL?

>  /* { dg-final { scan-tree-dump-times "loopfn.0" 4 "optimized" } } */
>  /* { dg-final { scan-tree-dump-times "loopfn.1" 4 "optimized" } } */

	Jakub

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2018-03-20 14:35 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-16 14:09 [PATCH] Fix PR84859 Richard Biener
2018-03-19 12:12 ` Jakub Jelinek
2018-03-20 12:07   ` Bin.Cheng
2018-03-20 12:08     ` Richard Biener
2018-03-20 12:44       ` Richard Biener
2018-03-20 14:37         ` Jakub Jelinek
2018-03-20 13:02           ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).