[PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
@ 2016-12-16 15:20 Aldy Hernandez
  2016-12-19 23:30 ` Jeff Law
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Aldy Hernandez @ 2016-12-16 15:20 UTC (permalink / raw)
  To: gcc-patches, Richard Biener, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 894 bytes --]

Hi folks.

This is a follow-up on Jeff and Richi's interaction on the 
aforementioned PR here:

	https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01397.html

I decided to explore the idea of analyzing may-undefness on-demand, 
which actually looks rather cheap.

BTW, I don't understand why we don't have auto_bitmap's, as we already 
have auto_sbitmap's.  I've implemented the former based on 
auto_sbitmap's code we already have.

The attached patch fixes the bug without introducing any regressions.

I also tested the patch by compiling 242 .ii files with -O3.  These were 
gathered from a stage1 build with -save-temps.  There is a slight time 
degradation of 4 seconds within 27 minutes of user time:

	tainted:	26:52
	orig:		26:48

This was the average aggregate time of two runs compiling all 242 .ii 
files.  IMO, this looks reasonable.  It is after all, -O3.    Is it 
acceptable?

Aldy

[-- Attachment #2: curr --]
[-- Type: text/plain, Size: 4713 bytes --]

commit 2310bcd0e2552a40ca1de354faf005ed3e9daf4e
Author: Aldy Hernandez <aldyh@redhat.com>
Date:   Fri Dec 16 03:44:52 2016 -0500

            PR tree-optimization/71691
            * bitmap.h (class auto_bitmap): New.
            * tree-ssa-loop-unswitch.c (ssa_maybe_undefined_value_p): New.
            (tree_may_unswitch_on): Call ssa_maybe_undefined_value_p.

diff --git a/gcc/bitmap.h b/gcc/bitmap.h
index 1b2c8e0..bc32a21 100644
--- a/gcc/bitmap.h
+++ b/gcc/bitmap.h
@@ -802,4 +802,25 @@ bmp_iter_and_compl (bitmap_iterator *bi, unsigned *bit_no)
        bmp_iter_and_compl (&(ITER), &(BITNUM));				\
        bmp_iter_next (&(ITER), &(BITNUM)))
 
+/* A class that ties the lifetime of a bitmap to its scope.  */
+class auto_bitmap
+{
+ public:
+  auto_bitmap () { bits = BITMAP_ALLOC (NULL); }
+  ~auto_bitmap () { BITMAP_FREE (bits); }
+  // Allow calling bitmap functions on our bitmap.
+  operator bitmap () { return bits; }
+
+ private:
+  // Prevent making a copy that references our bitmap.
+  auto_bitmap (const auto_bitmap &);
+  auto_bitmap &operator = (const auto_bitmap &);
+#if __cplusplus >= 201103L
+  auto_bitmap (auto_bitmap &&);
+  auto_bitmap &operator = (auto_bitmap &&);
+#endif
+
+  bitmap bits;
+};
+
 #endif /* GCC_BITMAP_H */
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr71691.c b/gcc/testsuite/gcc.c-torture/execute/pr71691.c
new file mode 100644
index 0000000..1ffd1a2
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr71691.c
@@ -0,0 +1,47 @@
+/* { dg-options "-fno-tree-vrp" } */
+
+char b;
+short f;
+unsigned e;
+int g = 20;
+
+void
+foo ()
+{
+  int l, h;
+  for (l = 0; l <= 7; l++)
+    {
+      int j = 38;
+      if (g)
+	h = 0;
+      for (; h <= 7; h++)
+	{
+	  int i, k = b % (j % 4);
+	  g = f;
+	  for (;;)
+	    {
+	      j = 6 || b;
+	      if (e)
+		{
+		  for (; j; --j)
+		    if (k)
+		      __builtin_printf ("%d", 9);
+		  if (i)
+		    __builtin_printf ("%d", j);
+		}
+	      if (l)
+		continue;
+	      break;
+	    }
+	  i = f || b;
+	}
+    }
+}
+
+int
+main ()
+{
+  foo ();
+  return 0;
+}
+
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index 40905af..bc34395 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -109,6 +109,71 @@ tree_ssa_unswitch_loops (void)
   return 0;
 }
 
+/* Return TRUE if an SSA_NAME may be undefined.  */
+
+static bool
+ssa_maybe_undefined_value_p (tree name)
+{
+  /* If it's obviously undefined, avoid further computations.  */
+  if (ssa_undefined_value_p (name, true))
+    return true;
+
+  auto_bitmap visited_ssa;
+  auto_vec<tree> worklist;
+  worklist.safe_push (name);
+  while (!worklist.is_empty ())
+    {
+      name = worklist.pop ();
+      gcc_assert (TREE_CODE (name) == SSA_NAME);
+
+      if (ssa_undefined_value_p (name, true))
+	return true;
+
+      bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
+
+      /* Check that all the PHI args are fully defined.  */
+      gimple *def = SSA_NAME_DEF_STMT (name);
+      if (gphi *phi = dyn_cast <gphi *> (def))
+	{
+	  if (virtual_operand_p (PHI_RESULT (phi)))
+	    continue;
+	  for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
+	    {
+	      name = gimple_phi_arg_def (phi, i);
+	      if (TREE_CODE (name) == SSA_NAME)
+		{
+		  /* If an SSA has already been seen, this may be a loop.
+		     Fail conservatively.  */
+		  if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
+		    return false;
+		  bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
+		  worklist.safe_push (name);
+		}
+	    }
+	  continue;
+	}
+
+      /* Check that any SSA names used to define NAME is also fully
+	 defined.  */
+      use_operand_p use_p;
+      ssa_op_iter iter;
+      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
+	{
+	  name = USE_FROM_PTR (use_p);
+	  if (TREE_CODE (name) == SSA_NAME)
+	    {
+	      /* If an SSA has already been seen, this may be a loop.
+		 Fail conservatively.  */
+	      if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
+		return false;
+	      bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
+	      worklist.safe_push (name);
+	    }
+	}
+    }
+  return false;
+}
+
 /* Checks whether we can unswitch LOOP on condition at end of BB -- one of its
    basic blocks (for what it means see comments below).  */
 
@@ -138,7 +203,7 @@ tree_may_unswitch_on (basic_block bb, struct loop *loop)
     {
       /* Unswitching on undefined values would introduce undefined
 	 behavior that the original program might never exercise.  */
-      if (ssa_undefined_value_p (use, true))
+      if (ssa_maybe_undefined_value_p (use))
 	return NULL_TREE;
       def = SSA_NAME_DEF_STMT (use);
       def_bb = gimple_bb (def);

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2016-12-16 15:20 [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2) Aldy Hernandez
@ 2016-12-19 23:30 ` Jeff Law
  2016-12-20 14:39 ` Richard Biener
  2017-01-05  4:13 ` Trevor Saunders
  2 siblings, 0 replies; 23+ messages in thread
From: Jeff Law @ 2016-12-19 23:30 UTC (permalink / raw)
  To: Aldy Hernandez, gcc-patches, Richard Biener

On 12/16/2016 07:41 AM, Aldy Hernandez wrote:
>
> BTW, I don't understand why we don't have auto_bitmap's, as we already
> have auto_sbitmap's.  I've implemented the former based on
> auto_sbitmap's code we already have.
Trevor poked at it a bit.  bitmaps are a bit more complex than sbitmaps 
in terms of implementation details.

https://gcc.gnu.org/ml/gcc/2016-04/msg00013.html

But his suggestion was to first create auto_bitmap, then look to convert 
to using that as opposed to his other approaches.


>
> The attached patch fixes the bug without introducing any regressions.
>
> I also tested the patch by compiling 242 .ii files with -O3.  These were
> gathered from a stage1 build with -save-temps.  There is a slight time
> degradation of 4 seconds within 27 minutes of user time:
>
>     tainted:    26:52
>     orig:        26:48
>
> This was the average aggregate time of two runs compiling all 242 .ii
> files.  IMO, this looks reasonable.  It is after all, -O3.    Is it
> acceptable?
>
> Aldy
>
> curr
>
>
> commit 2310bcd0e2552a40ca1de354faf005ed3e9daf4e
> Author: Aldy Hernandez <aldyh@redhat.com>
> Date:   Fri Dec 16 03:44:52 2016 -0500
>
>             PR tree-optimization/71691
>             * bitmap.h (class auto_bitmap): New.
>             * tree-ssa-loop-unswitch.c (ssa_maybe_undefined_value_p): New.
>             (tree_may_unswitch_on): Call ssa_maybe_undefined_value_p.
So I'm going to defer to Richi since he was reviewing my original 
attempt in this space.

It probably doesn't matter in practice (when I looked at this I couldn't 
get into the code in question with a -O3 bootstrap or with the 
testsuite, just with the testcase in the BZ) but you might consider 
handling an already visited node slightly differently.

If the the node was visited and resolved as undefined, then we would 
have already exited the loop.

If the node was visited and resolved as defined, then we could just keep 
processing other items on the the worklist.

The case where you want to conservatively return false is when you're 
actively processing the name in question.

Jeff

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2016-12-16 15:20 [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2) Aldy Hernandez
  2016-12-19 23:30 ` Jeff Law
@ 2016-12-20 14:39 ` Richard Biener
  2016-12-20 16:01   ` Jeff Law
                     ` (2 more replies)
  2017-01-05  4:13 ` Trevor Saunders
  2 siblings, 3 replies; 23+ messages in thread
From: Richard Biener @ 2016-12-20 14:39 UTC (permalink / raw)
  To: Aldy Hernandez; +Cc: gcc-patches, Jeff Law

On Fri, Dec 16, 2016 at 3:41 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
> Hi folks.
>
> This is a follow-up on Jeff and Richi's interaction on the aforementioned PR
> here:
>
>         https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01397.html
>
> I decided to explore the idea of analyzing may-undefness on-demand, which
> actually looks rather cheap.
>
> BTW, I don't understand why we don't have auto_bitmap's, as we already have
> auto_sbitmap's.  I've implemented the former based on auto_sbitmap's code we
> already have.
>
> The attached patch fixes the bug without introducing any regressions.
>
> I also tested the patch by compiling 242 .ii files with -O3.  These were
> gathered from a stage1 build with -save-temps.  There is a slight time
> degradation of 4 seconds within 27 minutes of user time:
>
>         tainted:        26:52
>         orig:           26:48
>
> This was the average aggregate time of two runs compiling all 242 .ii files.
> IMO, this looks reasonable.  It is after all, -O3.    Is it acceptable?

+  while (!worklist.is_empty ())
+    {
+      name = worklist.pop ();
+      gcc_assert (TREE_CODE (name) == SSA_NAME);
+
+      if (ssa_undefined_value_p (name, true))
+       return true;
+
+      bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));

it should be already set as we use visited_ssa as "was it ever on the worklist",
so maybe renaming it would be a good thing as well.

+             if (TREE_CODE (name) == SSA_NAME)
+               {
+                 /* If an SSA has already been seen, this may be a loop.
+                    Fail conservatively.  */
+                 if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
+                   return false;

so to me "conservative" is returning true, not false.

+                 bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
+                 worklist.safe_push (name);

but for loops we can just continue and ignore this use.  And bitmap_set_bit
returns whether it set a bit, thus

                if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name)))
                  worklist.safe_push (name);

should work?

+      /* Check that any SSA names used to define NAME is also fully
+        defined.  */
+      use_operand_p use_p;
+      ssa_op_iter iter;
+      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
+       {
+         name = USE_FROM_PTR (use_p);
+         if (TREE_CODE (name) == SSA_NAME)

always true.

+           {
+             /* If an SSA has already been seen, this may be a loop.
+                Fail conservatively.  */
+             if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
+               return false;
+             bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
+             worklist.safe_push (name);

See above.

In principle the thing is sound but I'd like to be able to pass in a bitmap of
known maybe-undefined/must-defined SSA names to have a cache for
multiple invocations from within a pass (like this unswitching case).

Also once you hit defs that are in a post-dominated region of the loop entry
you can treat them as not undefined (as their use invokes undefined
behavior).  This is also how you treat function parameters (well,
ssa_undefined_value_p does), where the call site invokes undefined behavior
when passing in undefined values.  So we need an extra parameter specifying
the post-dominance region.

You do not handle memory or calls conservatively which means the existing
testcase only needs some obfuscation to become a problem again.  To fix
that before /* Check that any SSA names used to define NAME is also fully
defined.  */ bail out conservatively, like

   if (! is_gimple_assign (def)
      || gimple_assign_single_p (def))
    return true;

Only unswitching on conditions that post-dominate the loop entry might be a
simpler solution for the PR in question.  OTOH this may disable too much
unswitching in practice.

Richard.

> Aldy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2016-12-20 14:39 ` Richard Biener
@ 2016-12-20 16:01   ` Jeff Law
  2016-12-20 17:40     ` Richard Biener
  2016-12-21 18:52   ` Aldy Hernandez
  2017-01-03 17:37   ` Aldy Hernandez
  2 siblings, 1 reply; 23+ messages in thread
From: Jeff Law @ 2016-12-20 16:01 UTC (permalink / raw)
  To: Richard Biener, Aldy Hernandez; +Cc: gcc-patches

On 12/20/2016 07:16 AM, Richard Biener wrote:
>
> it should be already set as we use visited_ssa as "was it ever on the worklist",
> so maybe renaming it would be a good thing as well.
>
> +             if (TREE_CODE (name) == SSA_NAME)
> +               {
> +                 /* If an SSA has already been seen, this may be a loop.
> +                    Fail conservatively.  */
> +                 if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
> +                   return false;
>
> so to me "conservative" is returning true, not false.
Agreed.

>
> +                 bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
> +                 worklist.safe_push (name);
>
> but for loops we can just continue and ignore this use.  And bitmap_set_bit
> returns whether it set a bit, thus
>
>                 if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name)))
>                   worklist.safe_push (name);
>
> should work?
What if we're in an irreducible region?


>
> In principle the thing is sound but I'd like to be able to pass in a bitmap of
> known maybe-undefined/must-defined SSA names to have a cache for
> multiple invocations from within a pass (like this unswitching case).
So that means keeping another bitmap for things positively identified as 
defined, then saving it for later invocations.  So maybe some of my work 
+ some of his melded together.  (mine computes once for the entire FN 
and saves the result, so converting it to on-demand saving the result 
shouldn't be terrible).


>
> Only unswitching on conditions that post-dominate the loop entry might be a
> simpler solution for the PR in question.  OTOH this may disable too much
> unswitching in practice.
It might be worth experimentation.

Jeff

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2016-12-20 16:01   ` Jeff Law
@ 2016-12-20 17:40     ` Richard Biener
  2016-12-21 18:00       ` Jeff Law
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Biener @ 2016-12-20 17:40 UTC (permalink / raw)
  To: Jeff Law, Aldy Hernandez; +Cc: gcc-patches

On December 20, 2016 4:50:25 PM GMT+01:00, Jeff Law <law@redhat.com> wrote:
>On 12/20/2016 07:16 AM, Richard Biener wrote:
>>
>> it should be already set as we use visited_ssa as "was it ever on the
>worklist",
>> so maybe renaming it would be a good thing as well.
>>
>> +             if (TREE_CODE (name) == SSA_NAME)
>> +               {
>> +                 /* If an SSA has already been seen, this may be a
>loop.
>> +                    Fail conservatively.  */
>> +                 if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION
>(name)))
>> +                   return false;
>>
>> so to me "conservative" is returning true, not false.
>Agreed.
>
>>
>> +                 bitmap_set_bit (visited_ssa, SSA_NAME_VERSION
>(name));
>> +                 worklist.safe_push (name);
>>
>> but for loops we can just continue and ignore this use.  And
>bitmap_set_bit
>> returns whether it set a bit, thus
>>
>>                 if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION
>(name)))
>>                   worklist.safe_push (name);
>>
>> should work?
>What if we're in an irreducible region?

Handling all back edges by deferring to their defs should work.  At least I can't see how it would not.

>
>>
>> In principle the thing is sound but I'd like to be able to pass in a
>bitmap of
>> known maybe-undefined/must-defined SSA names to have a cache for
>> multiple invocations from within a pass (like this unswitching case).
>So that means keeping another bitmap for things positively identified
>as 
>defined, then saving it for later invocations.

We eventually need a tristate here for maximum caching.  And as the result depends on the dominating BB of the postdom region the savings might be questionable.

>  So maybe some of my
>work 
>+ some of his melded together.  (mine computes once for the entire FN 
>and saves the result, so converting it to on-demand saving the result 
>shouldn't be terrible).
>
>
>>
>> Only unswitching on conditions that post-dominate the loop entry
>might be a
>> simpler solution for the PR in question.  OTOH this may disable too
>much
>> unswitching in practice.
>It might be worth experimentation.
>
>Jeff


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2016-12-20 17:40     ` Richard Biener
@ 2016-12-21 18:00       ` Jeff Law
  0 siblings, 0 replies; 23+ messages in thread
From: Jeff Law @ 2016-12-21 18:00 UTC (permalink / raw)
  To: Richard Biener, Aldy Hernandez; +Cc: gcc-patches

On 12/20/2016 10:33 AM, Richard Biener wrote:
>>>
>>> but for loops we can just continue and ignore this use.  And
>> bitmap_set_bit
>>> returns whether it set a bit, thus
>>>
>>>                 if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION
>> (name)))
>>>                   worklist.safe_push (name);
>>>
>>> should work?
>> What if we're in an irreducible region?
>
> Handling all back edges by deferring to their defs should work.  At least I can't see how it would not.
I'll take your word for it -- I haven't thought deeply about this.



>
>>
>>>
>>> In principle the thing is sound but I'd like to be able to pass in a
>> bitmap of
>>> known maybe-undefined/must-defined SSA names to have a cache for
>>> multiple invocations from within a pass (like this unswitching case).
>> So that means keeping another bitmap for things positively identified
>> as
>> defined, then saving it for later invocations.
>
> We eventually need a tristate here for maximum caching.  And as the result depends on the dominating BB of the postdom region the savings might be questionable.
Possibly.

Jeff

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2016-12-20 14:39 ` Richard Biener
  2016-12-20 16:01   ` Jeff Law
@ 2016-12-21 18:52   ` Aldy Hernandez
  2017-01-04 11:43     ` Richard Biener
  2017-01-03 17:37   ` Aldy Hernandez
  2 siblings, 1 reply; 23+ messages in thread
From: Aldy Hernandez @ 2016-12-21 18:52 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Jeff Law

On 12/20/2016 09:16 AM, Richard Biener wrote:

> You do not handle memory or calls conservatively which means the existing
> testcase only needs some obfuscation to become a problem again.  To fix
> that before /* Check that any SSA names used to define NAME is also fully
> defined.  */ bail out conservatively, like
>
>    if (! is_gimple_assign (def)
>       || gimple_assign_single_p (def))
>     return true;

I understand the !is_gimple_assign, which will skip over GIMPLE_CALLs 
setting a value, but the "gimple_assign_single_p(def) == true"??

Won't this last one bail on things like e.3_7 = e, or x_77 = y_88? 
Don't we want to follow up the def chain precisely on these?

Thanks.
Aldy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2016-12-20 14:39 ` Richard Biener
  2016-12-20 16:01   ` Jeff Law
  2016-12-21 18:52   ` Aldy Hernandez
@ 2017-01-03 17:37   ` Aldy Hernandez
  2017-01-04 12:12     ` Richard Biener
  2 siblings, 1 reply; 23+ messages in thread
From: Aldy Hernandez @ 2017-01-03 17:37 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 4866 bytes --]

On 12/20/2016 09:16 AM, Richard Biener wrote:
> On Fri, Dec 16, 2016 at 3:41 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>> Hi folks.
>>
>> This is a follow-up on Jeff and Richi's interaction on the aforementioned PR
>> here:
>>
>>         https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01397.html
>>
>> I decided to explore the idea of analyzing may-undefness on-demand, which
>> actually looks rather cheap.
>>
>> BTW, I don't understand why we don't have auto_bitmap's, as we already have
>> auto_sbitmap's.  I've implemented the former based on auto_sbitmap's code we
>> already have.
>>
>> The attached patch fixes the bug without introducing any regressions.
>>
>> I also tested the patch by compiling 242 .ii files with -O3.  These were
>> gathered from a stage1 build with -save-temps.  There is a slight time
>> degradation of 4 seconds within 27 minutes of user time:
>>
>>         tainted:        26:52
>>         orig:           26:48
>>
>> This was the average aggregate time of two runs compiling all 242 .ii files.
>> IMO, this looks reasonable.  It is after all, -O3.    Is it acceptable?
>
> +  while (!worklist.is_empty ())
> +    {
> +      name = worklist.pop ();
> +      gcc_assert (TREE_CODE (name) == SSA_NAME);
> +
> +      if (ssa_undefined_value_p (name, true))
> +       return true;
> +
> +      bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>
> it should be already set as we use visited_ssa as "was it ever on the worklist",
> so maybe renaming it would be a good thing as well.

I don't understand what you would prefer here.

>
> +             if (TREE_CODE (name) == SSA_NAME)
> +               {
> +                 /* If an SSA has already been seen, this may be a loop.
> +                    Fail conservatively.  */
> +                 if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
> +                   return false;
>
> so to me "conservative" is returning true, not false.

OK

>
> +                 bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
> +                 worklist.safe_push (name);
>
> but for loops we can just continue and ignore this use.  And bitmap_set_bit
> returns whether it set a bit, thus
>
>                 if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name)))
>                   worklist.safe_push (name);
>
> should work?

Fixed.

>
> +      /* Check that any SSA names used to define NAME is also fully
> +        defined.  */
> +      use_operand_p use_p;
> +      ssa_op_iter iter;
> +      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
> +       {
> +         name = USE_FROM_PTR (use_p);
> +         if (TREE_CODE (name) == SSA_NAME)
>
> always true.
>
> +           {
> +             /* If an SSA has already been seen, this may be a loop.
> +                Fail conservatively.  */
> +             if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
> +               return false;
> +             bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
> +             worklist.safe_push (name);
>
> See above.
>
> In principle the thing is sound but I'd like to be able to pass in a bitmap of
> known maybe-undefined/must-defined SSA names to have a cache for
> multiple invocations from within a pass (like this unswitching case).

Done, though bitmaps are now calculated as part of the instantiation.

>
> Also once you hit defs that are in a post-dominated region of the loop entry
> you can treat them as not undefined (as their use invokes undefined
> behavior).  This is also how you treat function parameters (well,
> ssa_undefined_value_p does), where the call site invokes undefined behavior
> when passing in undefined values.  So we need an extra parameter specifying
> the post-dominance region.

Done.

>
> You do not handle memory or calls conservatively which means the existing
> testcase only needs some obfuscation to become a problem again.  To fix
> that before /* Check that any SSA names used to define NAME is also fully
> defined.  */ bail out conservatively, like
>
>    if (! is_gimple_assign (def)
>       || gimple_assign_single_p (def))
>     return true;

As I asked previously, I understand the !is_gimple_assign, which will 
skip over GIMPLE_CALLs setting a value, but the 
"gimple_assign_single_p(def) == true"??

Won't this last one bail on things like e.3_7 = e, or x_77 = y_88? Don't 
we want to follow up the def chain precisely on these?

The attached implementation uses a cache, and a pre-computed 
post-dominance region.  A previous incantation of this patch (sans the 
post-dominance stuff) had performance within the noise of the previous 
implementation.

I am testing again, and will do some performance comparisons in a bit, 
but for now-- are we on the same page here?  Is this what you wanted?

Aldy

p.s. I could turn the post-dominance region into a bitmap for faster 
access if preferred.

[-- Attachment #2: curr --]
[-- Type: text/plain, Size: 13348 bytes --]

commit 47d0c1b3144d4d56405d72c3ad55183d8632d0a7
Author: Aldy Hernandez <aldyh@redhat.com>
Date:   Fri Dec 16 03:44:52 2016 -0500

            PR tree-optimization/71691
            * bitmap.h (class auto_bitmap): New.
            * tree-ssa-defined-or-undefined.c: New file.
            * tree-ssa-defined-or-undefined.h: New file.
            * Makefile.in (tree-ssa-defined-or-undefined.o): New.
            * tree-ssa-loop-unswitch.c (tree_ssa_unswitch_loops): Calculate
            post dominance info.
            (tree_may_unswitch_on): Add defined_or_undefined parameter.
            Use defined_or_undefined instead of ssa_undefined_value_p.
            (tree_unswitch_single_loop): Instantiate defined_or_undefined
            variable.

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 2aae684..beff302 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1511,6 +1511,7 @@ OBJS = \
 	tree-ssa-coalesce.o \
 	tree-ssa-copy.o \
 	tree-ssa-dce.o \
+	tree-ssa-defined-or-undefined.o \
 	tree-ssa-dom.o \
 	tree-ssa-dse.o \
 	tree-ssa-forwprop.o \
diff --git a/gcc/bitmap.h b/gcc/bitmap.h
index 1b2c8e0..bc32a21 100644
--- a/gcc/bitmap.h
+++ b/gcc/bitmap.h
@@ -802,4 +802,25 @@ bmp_iter_and_compl (bitmap_iterator *bi, unsigned *bit_no)
        bmp_iter_and_compl (&(ITER), &(BITNUM));				\
        bmp_iter_next (&(ITER), &(BITNUM)))
 
+/* A class that ties the lifetime of a bitmap to its scope.  */
+class auto_bitmap
+{
+ public:
+  auto_bitmap () { bits = BITMAP_ALLOC (NULL); }
+  ~auto_bitmap () { BITMAP_FREE (bits); }
+  // Allow calling bitmap functions on our bitmap.
+  operator bitmap () { return bits; }
+
+ private:
+  // Prevent making a copy that references our bitmap.
+  auto_bitmap (const auto_bitmap &);
+  auto_bitmap &operator = (const auto_bitmap &);
+#if __cplusplus >= 201103L
+  auto_bitmap (auto_bitmap &&);
+  auto_bitmap &operator = (auto_bitmap &&);
+#endif
+
+  bitmap bits;
+};
+
 #endif /* GCC_BITMAP_H */
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr71691.c b/gcc/testsuite/gcc.c-torture/execute/pr71691.c
new file mode 100644
index 0000000..1ffd1a2
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr71691.c
@@ -0,0 +1,47 @@
+/* { dg-options "-fno-tree-vrp" } */
+
+char b;
+short f;
+unsigned e;
+int g = 20;
+
+void
+foo ()
+{
+  int l, h;
+  for (l = 0; l <= 7; l++)
+    {
+      int j = 38;
+      if (g)
+	h = 0;
+      for (; h <= 7; h++)
+	{
+	  int i, k = b % (j % 4);
+	  g = f;
+	  for (;;)
+	    {
+	      j = 6 || b;
+	      if (e)
+		{
+		  for (; j; --j)
+		    if (k)
+		      __builtin_printf ("%d", 9);
+		  if (i)
+		    __builtin_printf ("%d", j);
+		}
+	      if (l)
+		continue;
+	      break;
+	    }
+	  i = f || b;
+	}
+    }
+}
+
+int
+main ()
+{
+  foo ();
+  return 0;
+}
+
diff --git a/gcc/tree-ssa-defined-or-undefined.c b/gcc/tree-ssa-defined-or-undefined.c
new file mode 100644
index 0000000..f845eba
--- /dev/null
+++ b/gcc/tree-ssa-defined-or-undefined.c
@@ -0,0 +1,134 @@
+/* Simple class for classifying SSA_NAMEs that are either fully
+   defined or possibly undefined.
+
+   This is meant to support conservative analysis for optimization
+   purposes, not for generating warnings.  The analysis for generating
+   warnings is deeper to avoid generation of false positives.
+
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   Contributed by Aldy Hernandez <aldyh@redhat.com>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "ssa.h"
+#include "tree-ssa.h"
+#include "tree-ssa-defined-or-undefined.h"
+
+/* Return TRUE if an SSA_NAME maybe undefined.  */
+
+bool
+defined_or_undefined::is_maybe_undefined (const tree name)
+{
+  if (in_cache (name))
+    return bitmap_bit_p (b_maybe_undef, SSA_NAME_VERSION (name));
+
+  auto_bitmap visited_ssa;
+  auto_vec<tree> worklist;
+  worklist.safe_push (name);
+  while (!worklist.is_empty ())
+    {
+      tree t = worklist.pop ();
+
+      /* If it's obviously undefined, avoid further computations.  */
+      if (ssa_undefined_value_p (t, true))
+	{
+	  set_maybe_undefined (t);
+	  if (t != name)
+	    set_maybe_undefined (name);
+	  return true;
+	}
+
+      bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (t));
+
+      /* A PARM_DECL will not have an SSA_NAME_DEF_STMT.  Parameters
+	 get their initial value from function entry.  */
+      if (SSA_NAME_VAR (t) && TREE_CODE (SSA_NAME_VAR (t)) == PARM_DECL)
+	{
+	  set_defined (t);
+	  continue;
+	}
+
+      gimple *def = SSA_NAME_DEF_STMT (t);
+
+      /* DEFs in the post-dominated region of the loop entry invoke
+	 undefined behavior.  Adding any use won't make things any
+	 worse.  */
+      for (unsigned i = 0; i < postdom.length (); ++i)
+	if (def->bb == postdom[i])
+	  {
+	    set_defined (t);
+	    return false;
+	  }
+
+      /* Check that all the PHI args are fully defined.  */
+      if (gphi *phi = dyn_cast <gphi *> (def))
+	{
+	  if (virtual_operand_p (PHI_RESULT (phi)))
+	    continue;
+	  for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
+	    {
+	      tree t = gimple_phi_arg_def (phi, i);
+	      /* If an SSA has already been seen, it may be a loop,
+		 but we can continue and ignore this use.  Otherwise,
+		 add the SSA_NAME to the queue and visit it later.  */
+	      if (TREE_CODE (t) == SSA_NAME
+		  && bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (t)))
+		worklist.safe_push (t);
+	    }
+	  continue;
+	}
+
+      /* Handle memory and calls conservatively.  */
+      if (!is_gimple_assign (def)
+	  /* ?? Do we really want this?  The code below will return
+	     TRUE for something like "e.3_7 = e", which is probably
+	     not what we want.
+	  || gimple_assign_single_p (def)
+	  */
+	  )
+	{
+	  set_maybe_undefined (name);
+	  return true;
+	}
+
+      /* Check that any SSA names used to define NAME is also fully
+	 defined.  */
+      use_operand_p use_p;
+      ssa_op_iter iter;
+      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
+	{
+	  tree t = USE_FROM_PTR (use_p);
+	  /* If an SSA has already been seen, it may be a loop,
+	     but we can continue and ignore this use.  Otherwise,
+	     add the SSA_NAME to the queue and visit it later.  */
+	  if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (t)))
+	    worklist.safe_push (t);
+	}
+    }
+  set_defined (name);
+  return false;
+}
diff --git a/gcc/tree-ssa-defined-or-undefined.h b/gcc/tree-ssa-defined-or-undefined.h
new file mode 100644
index 0000000..23e306a
--- /dev/null
+++ b/gcc/tree-ssa-defined-or-undefined.h
@@ -0,0 +1,73 @@
+/* Simple class for identifying SSA_NAMEs that are either fully defined
+   or maybe undefined.
+
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   Contributed by Aldy Hernandez <aldyh@redhat.com>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_TREE_SSA_DEFINED_OR_UNDEFINED_H
+#define GCC_TREE_SSA_DEFINED_OR_UNDEFINED_H
+
+/* Simple class for identifying SSA_NAMEs that are either fully
+   defined or maybe undefined.
+
+   This is meant to support conservative analysis for optimization
+   purposes, not for generating warnings.  The analysis for generating
+   warnings is deeper to avoid generation of false positives.
+
+   Instantiation of this class triggers analysis which is not kept
+   up-to-date as changes in the IL or CFG are made.
+
+   Queries will return the conservative result (maybe undefined) for
+   SSA_NAMEs that were not encountered during the initial
+   analysis.  */
+
+class defined_or_undefined
+{
+ private:
+  // Bitmap specifying which SSAs are in the cache.
+  auto_bitmap b_seen;
+  // Bit is set if a given SSA is maybe undefined, otherwise the given
+  // SSA is definitely defined.
+  auto_bitmap b_maybe_undef;
+  // The post-dominance region of the current loop's entry.
+  vec<basic_block> postdom;
+
+  // Return TRUE if SSA has been previously cached.
+  bool in_cache (tree ssa)
+  { return bitmap_bit_p (b_seen, SSA_NAME_VERSION (ssa)); }
+  // Cache SSA to be maybe undefined.
+  void set_maybe_undefined (tree ssa)
+  { bitmap_set_bit (b_seen, SSA_NAME_VERSION (ssa));
+    bitmap_set_bit (b_maybe_undef, SSA_NAME_VERSION (ssa)); }
+  // Cache SSA to be definitely defined.
+  void set_defined (tree ssa)
+  { bitmap_set_bit (b_seen, SSA_NAME_VERSION (ssa));
+    bitmap_clear_bit (b_maybe_undef, SSA_NAME_VERSION (ssa)); }
+ public:
+  defined_or_undefined ();
+  // ENTRY is the current loop's entry block.
+  defined_or_undefined (basic_block entry)
+    { postdom = get_dominated_by_region (CDI_POST_DOMINATORS, &entry, 1); }
+  // Return TRUE if SSA is maybe undefined.
+  bool is_maybe_undefined (tree ssa);
+  // Return TRUE if SSA is definitely defined.
+  bool is_defined (tree ssa) { return !is_maybe_undefined (ssa); }
+};
+
+#endif // GCC_TREE_SSA_DEFINED_OR_UNDEFINED_H
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index 40905af..ea4b68d 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -38,6 +38,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-iterator.h"
 #include "cfghooks.h"
 #include "tree-ssa-loop-manip.h"
+#include "tree-ssa-defined-or-undefined.h"
 
 /* This file implements the loop unswitching, i.e. transformation of loops like
 
@@ -77,7 +78,8 @@ along with GCC; see the file COPYING3.  If not see
 
 static struct loop *tree_unswitch_loop (struct loop *, basic_block, tree);
 static bool tree_unswitch_single_loop (struct loop *, int);
-static tree tree_may_unswitch_on (basic_block, struct loop *);
+static tree tree_may_unswitch_on (basic_block, struct loop *,
+				  defined_or_undefined *);
 static bool tree_unswitch_outer_loop (struct loop *);
 static edge find_loop_guard (struct loop *);
 static bool empty_bb_without_guard_p (struct loop *, basic_block);
@@ -94,6 +96,9 @@ tree_ssa_unswitch_loops (void)
   struct loop *loop;
   bool changed = false;
 
+  /* Used for defined_or_undefined use later.  */
+  calculate_dominance_info (CDI_POST_DOMINATORS);
+
   /* Go through all loops starting from innermost.  */
   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
     {
@@ -104,6 +109,8 @@ tree_ssa_unswitch_loops (void)
 	changed |= tree_unswitch_outer_loop (loop);
     }
 
+  free_dominance_info (CDI_POST_DOMINATORS);
+
   if (changed)
     return TODO_cleanup_cfg;
   return 0;
@@ -113,7 +120,8 @@ tree_ssa_unswitch_loops (void)
    basic blocks (for what it means see comments below).  */
 
 static tree
-tree_may_unswitch_on (basic_block bb, struct loop *loop)
+tree_may_unswitch_on (basic_block bb, struct loop *loop,
+		      defined_or_undefined *defined_or_undefined)
 {
   gimple *last, *def;
   gcond *stmt;
@@ -138,7 +146,7 @@ tree_may_unswitch_on (basic_block bb, struct loop *loop)
     {
       /* Unswitching on undefined values would introduce undefined
 	 behavior that the original program might never exercise.  */
-      if (ssa_undefined_value_p (use, true))
+      if (defined_or_undefined->is_maybe_undefined (use))
 	return NULL_TREE;
       def = SSA_NAME_DEF_STMT (use);
       def_bb = gimple_bb (def);
@@ -200,6 +208,7 @@ tree_unswitch_single_loop (struct loop *loop, int num)
   gimple *stmt;
   bool changed = false;
   HOST_WIDE_INT iterations;
+  defined_or_undefined defined_or_undefined (loop->header);
 
   /* Perform initial tests if unswitch is eligible.  */
   if (num == 0)
@@ -243,7 +252,7 @@ tree_unswitch_single_loop (struct loop *loop, int num)
     {
       /* Find a bb to unswitch on.  */
       for (; i < loop->num_nodes; i++)
-	if ((cond = tree_may_unswitch_on (bbs[i], loop)))
+	if ((cond = tree_may_unswitch_on (bbs[i], loop, &defined_or_undefined)))
 	  break;
 
       if (i == loop->num_nodes)
@@ -363,7 +372,8 @@ tree_unswitch_single_loop (struct loop *loop, int num)
       /* Find a bb to unswitch on.  */
       for (; found < loop->num_nodes; found++)
 	if ((bbs[found]->flags & BB_REACHABLE)
-	    && (cond = tree_may_unswitch_on (bbs[found], loop)))
+	    && (cond = tree_may_unswitch_on (bbs[found], loop,
+					     &defined_or_undefined)))
 	  break;
 
       if (found == loop->num_nodes)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2016-12-21 18:52   ` Aldy Hernandez
@ 2017-01-04 11:43     ` Richard Biener
  0 siblings, 0 replies; 23+ messages in thread
From: Richard Biener @ 2017-01-04 11:43 UTC (permalink / raw)
  To: Aldy Hernandez; +Cc: gcc-patches, Jeff Law

On Wed, Dec 21, 2016 at 7:43 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
> On 12/20/2016 09:16 AM, Richard Biener wrote:
>
>> You do not handle memory or calls conservatively which means the existing
>> testcase only needs some obfuscation to become a problem again.  To fix
>> that before /* Check that any SSA names used to define NAME is also fully
>> defined.  */ bail out conservatively, like
>>
>>    if (! is_gimple_assign (def)
>>       || gimple_assign_single_p (def))
>>     return true;
>
>
> I understand the !is_gimple_assign, which will skip over GIMPLE_CALLs
> setting a value, but the "gimple_assign_single_p(def) == true"??
>
> Won't this last one bail on things like e.3_7 = e, or x_77 = y_88? Don't we
> want to follow up the def chain precisely on these?

For the first (= e) we can't, for the second, yes, the predicate is too strict.
But it's conservative ;)

Likewise for e_2 = VIEW_CONVERT_EXPR <u_88> btw, or REAL/IMAGPART_EXPR.

Note both VIEW_CONVERT_EXPR and REAL/IMAGPART_EXPR will also appear
in RHSs we can't handle.

Richard.

>
> Thanks.
> Aldy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-03 17:37   ` Aldy Hernandez
@ 2017-01-04 12:12     ` Richard Biener
  2017-01-07 12:54       ` Aldy Hernandez
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Biener @ 2017-01-04 12:12 UTC (permalink / raw)
  To: Aldy Hernandez; +Cc: gcc-patches, Jeff Law

On Tue, Jan 3, 2017 at 6:36 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
> On 12/20/2016 09:16 AM, Richard Biener wrote:
>>
>> On Fri, Dec 16, 2016 at 3:41 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>>
>>> Hi folks.
>>>
>>> This is a follow-up on Jeff and Richi's interaction on the aforementioned
>>> PR
>>> here:
>>>
>>>         https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01397.html
>>>
>>> I decided to explore the idea of analyzing may-undefness on-demand, which
>>> actually looks rather cheap.
>>>
>>> BTW, I don't understand why we don't have auto_bitmap's, as we already
>>> have
>>> auto_sbitmap's.  I've implemented the former based on auto_sbitmap's code
>>> we
>>> already have.
>>>
>>> The attached patch fixes the bug without introducing any regressions.
>>>
>>> I also tested the patch by compiling 242 .ii files with -O3.  These were
>>> gathered from a stage1 build with -save-temps.  There is a slight time
>>> degradation of 4 seconds within 27 minutes of user time:
>>>
>>>         tainted:        26:52
>>>         orig:           26:48
>>>
>>> This was the average aggregate time of two runs compiling all 242 .ii
>>> files.
>>> IMO, this looks reasonable.  It is after all, -O3.    Is it acceptable?
>>
>>
>> +  while (!worklist.is_empty ())
>> +    {
>> +      name = worklist.pop ();
>> +      gcc_assert (TREE_CODE (name) == SSA_NAME);
>> +
>> +      if (ssa_undefined_value_p (name, true))
>> +       return true;
>> +
>> +      bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>
>> it should be already set as we use visited_ssa as "was it ever on the
>> worklist",
>> so maybe renaming it would be a good thing as well.
>
>
> I don't understand what you would prefer here.

Set the bit when you put name on the worklist (and do not do that if the
bit is set).  Thus simply remove the above and add a bitmap_set_bit
for the initial name you put on the worklist.

>>
>> +             if (TREE_CODE (name) == SSA_NAME)
>> +               {
>> +                 /* If an SSA has already been seen, this may be a loop.
>> +                    Fail conservatively.  */
>> +                 if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
>> +                   return false;
>>
>> so to me "conservative" is returning true, not false.
>
>
> OK
>
>>
>> +                 bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>> +                 worklist.safe_push (name);
>>
>> but for loops we can just continue and ignore this use.  And
>> bitmap_set_bit
>> returns whether it set a bit, thus
>>
>>                 if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name)))
>>                   worklist.safe_push (name);
>>
>> should work?
>
>
> Fixed.
>
>>
>> +      /* Check that any SSA names used to define NAME is also fully
>> +        defined.  */
>> +      use_operand_p use_p;
>> +      ssa_op_iter iter;
>> +      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
>> +       {
>> +         name = USE_FROM_PTR (use_p);
>> +         if (TREE_CODE (name) == SSA_NAME)
>>
>> always true.
>>
>> +           {
>> +             /* If an SSA has already been seen, this may be a loop.
>> +                Fail conservatively.  */
>> +             if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
>> +               return false;
>> +             bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>> +             worklist.safe_push (name);
>>
>> See above.
>>
>> In principle the thing is sound but I'd like to be able to pass in a
>> bitmap of
>> known maybe-undefined/must-defined SSA names to have a cache for
>> multiple invocations from within a pass (like this unswitching case).
>
>
> Done, though bitmaps are now calculated as part of the instantiation.
>
>>
>> Also once you hit defs that are in a post-dominated region of the loop
>> entry
>> you can treat them as not undefined (as their use invokes undefined
>> behavior).  This is also how you treat function parameters (well,
>> ssa_undefined_value_p does), where the call site invokes undefined
>> behavior
>> when passing in undefined values.  So we need an extra parameter
>> specifying
>> the post-dominance region.
>
>
> Done.
>
>>
>> You do not handle memory or calls conservatively which means the existing
>> testcase only needs some obfuscation to become a problem again.  To fix
>> that before /* Check that any SSA names used to define NAME is also fully
>> defined.  */ bail out conservatively, like
>>
>>    if (! is_gimple_assign (def)
>>       || gimple_assign_single_p (def))
>>     return true;
>
>
> As I asked previously, I understand the !is_gimple_assign, which will skip
> over GIMPLE_CALLs setting a value, but the "gimple_assign_single_p(def) ==
> true"??
>
> Won't this last one bail on things like e.3_7 = e, or x_77 = y_88? Don't we
> want to follow up the def chain precisely on these?
>
> The attached implementation uses a cache, and a pre-computed post-dominance
> region.  A previous incantation of this patch (sans the post-dominance
> stuff) had performance within the noise of the previous implementation.
>
> I am testing again, and will do some performance comparisons in a bit, but
> for now-- are we on the same page here?  Is this what you wanted?

+      /* DEFs in the post-dominated region of the loop entry invoke
+        undefined behavior.  Adding any use won't make things any
+        worse.  */
+      for (unsigned i = 0; i < postdom.length (); ++i)
+       if (def->bb == postdom[i])

gimple_bb (def)

+         {
+           set_defined (t);
+           return false;
+         }

I think you can't really return false here but need to continue processing
the worklist.  I'd say rather than a linear walk you can as well use
dominated_by_p (CDI_POST_DOMINATORS, ...) and record the entry
block instead?

Note that the way you compute post-dominators doesn't work -- nothing
keeps them up-to-date when unswitching does CFG manipulations :/
Fortunately we have a way to compute them per region, see how tree-if-conv.c
does that (build_region, calculate_dominance_info_for_region).

+      /* Handle memory and calls conservatively.  */
+      if (!is_gimple_assign (def)
+         /* ?? Do we really want this?  The code below will return
+            TRUE for something like "e.3_7 = e", which is probably
+            not what we want.
+         || gimple_assign_single_p (def)

as said for loads it _is_ what we want (the memory may be uninitialized).
You can make it

     || (gimple_assign_single_p (def)
         && gimple_vuse (def))

to better catch that though (stop at memory loads only).

--

The way you add things to the cache (or not) is somewhat odd probably
given that you don't really compute for anything but the arg to
is_maybe_undefined
whether it is maybe undefined or not?

I am missing a cache check in the worklist processing at least but I also don't
see how common "tails" in SSA use trees of different names are reliably
cached (and I'm not sure it's easily possible to populate the cache for all
visited SSA names with conservative values -- a recursive implementation
would have made that obvious but with a worklist it might be tricky).

Given the way it is used from unswitching and given (citing from it):

  /* Condition must be invariant.  */
  FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
    {
      /* Unswitching on undefined values would introduce undefined
         behavior that the original program might never exercise.  */
      if (ssa_undefined_value_p (use, true))
        return NULL_TREE;
      def = SSA_NAME_DEF_STMT (use);
      def_bb = gimple_bb (def);
      if (def_bb
          && flow_bb_inside_loop_p (loop, def_bb))
^^^

we do not unswitch on exprs that are defined inside the loop at all, thus all
SSA use-def walking is pointless here (I think in your worklist processing
you miss that you can stop walking when you see a (non-PHI) def that
dominates the post-dom region entry).  The above also suggests to do that
flow_bb_inside_loop_p check first, eventually even for all stmt operands.


Thanks,
Richard.

> Aldy
>
> p.s. I could turn the post-dominance region into a bitmap for faster access
> if preferred.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2016-12-16 15:20 [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2) Aldy Hernandez
  2016-12-19 23:30 ` Jeff Law
  2016-12-20 14:39 ` Richard Biener
@ 2017-01-05  4:13 ` Trevor Saunders
  2 siblings, 0 replies; 23+ messages in thread
From: Trevor Saunders @ 2017-01-05  4:13 UTC (permalink / raw)
  To: Aldy Hernandez; +Cc: gcc-patches, Richard Biener, Jeff Law

On Fri, Dec 16, 2016 at 09:41:33AM -0500, Aldy Hernandez wrote:
> Hi folks.
> 
> This is a follow-up on Jeff and Richi's interaction on the aforementioned PR
> here:
> 
> 	https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01397.html
> 
> I decided to explore the idea of analyzing may-undefness on-demand, which
> actually looks rather cheap.
> 
> BTW, I don't understand why we don't have auto_bitmap's, as we already have
> auto_sbitmap's.  I've implemented the former based on auto_sbitmap's code we
> already have.

No good reason, it just hasn't been done yet.

I'm tempted to fit both this and auto_sbitmap into a unique_ptr with
custom deletor, but that would make it hard to do small object
optimizations, so maybe it isn't actually a great idea.

> +class auto_bitmap
> +{
> + public:
> +  auto_bitmap () { bits = BITMAP_ALLOC (NULL); }
> +  ~auto_bitmap () { BITMAP_FREE (bits); }

We could make the member a bitmap_head, and then use bitmap_initialize
and bitmap_clear in the ctor and dtor, that'd save a alloc / dealloc.

You might also want to use the CXX_MEM_STAT macros on the ctor to get
better attribution of memory usage.

> +  // Prevent making a copy that references our bitmap.
> +  auto_bitmap (const auto_bitmap &);
> +  auto_bitmap &operator = (const auto_bitmap &);
> +#if __cplusplus >= 201103L
> +  auto_bitmap (auto_bitmap &&);
> +  auto_bitmap &operator = (auto_bitmap &&);

We could actually support moving bitmaps pretty easily, though we would
need to do the auto_ptr style hacks to keep building as c++98.  I'll
probably do that work eventually for unique_ptr, but its kind of late
for gcc 8 unless we just use it to fix memory leaks.

> +#endif
> +
> +  bitmap bits;

style guide would say that should be m_bits I believe.

Trev

sorry about the lag here.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-04 12:12     ` Richard Biener
@ 2017-01-07 12:54       ` Aldy Hernandez
  2017-01-09  9:30         ` Richard Biener
  0 siblings, 1 reply; 23+ messages in thread
From: Aldy Hernandez @ 2017-01-07 12:54 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Jeff Law

On 01/04/2017 07:11 AM, Richard Biener wrote:
> On Tue, Jan 3, 2017 at 6:36 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>> On 12/20/2016 09:16 AM, Richard Biener wrote:
>>>
>>> On Fri, Dec 16, 2016 at 3:41 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>>>
>>>> Hi folks.
>>>>
>>>> This is a follow-up on Jeff and Richi's interaction on the aforementioned
>>>> PR
>>>> here:
>>>>
>>>>         https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01397.html
>>>>
>>>> I decided to explore the idea of analyzing may-undefness on-demand, which
>>>> actually looks rather cheap.
>>>>
>>>> BTW, I don't understand why we don't have auto_bitmap's, as we already
>>>> have
>>>> auto_sbitmap's.  I've implemented the former based on auto_sbitmap's code
>>>> we
>>>> already have.
>>>>
>>>> The attached patch fixes the bug without introducing any regressions.
>>>>
>>>> I also tested the patch by compiling 242 .ii files with -O3.  These were
>>>> gathered from a stage1 build with -save-temps.  There is a slight time
>>>> degradation of 4 seconds within 27 minutes of user time:
>>>>
>>>>         tainted:        26:52
>>>>         orig:           26:48
>>>>
>>>> This was the average aggregate time of two runs compiling all 242 .ii
>>>> files.
>>>> IMO, this looks reasonable.  It is after all, -O3.    Is it acceptable?
>>>
>>>
>>> +  while (!worklist.is_empty ())
>>> +    {
>>> +      name = worklist.pop ();
>>> +      gcc_assert (TREE_CODE (name) == SSA_NAME);
>>> +
>>> +      if (ssa_undefined_value_p (name, true))
>>> +       return true;
>>> +
>>> +      bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>>
>>> it should be already set as we use visited_ssa as "was it ever on the
>>> worklist",
>>> so maybe renaming it would be a good thing as well.
>>
>>
>> I don't understand what you would prefer here.
>
> Set the bit when you put name on the worklist (and do not do that if the
> bit is set).  Thus simply remove the above and add a bitmap_set_bit
> for the initial name you put on the worklist.
>
>>>
>>> +             if (TREE_CODE (name) == SSA_NAME)
>>> +               {
>>> +                 /* If an SSA has already been seen, this may be a loop.
>>> +                    Fail conservatively.  */
>>> +                 if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
>>> +                   return false;
>>>
>>> so to me "conservative" is returning true, not false.
>>
>>
>> OK
>>
>>>
>>> +                 bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>> +                 worklist.safe_push (name);
>>>
>>> but for loops we can just continue and ignore this use.  And
>>> bitmap_set_bit
>>> returns whether it set a bit, thus
>>>
>>>                 if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name)))
>>>                   worklist.safe_push (name);
>>>
>>> should work?
>>
>>
>> Fixed.
>>
>>>
>>> +      /* Check that any SSA names used to define NAME is also fully
>>> +        defined.  */
>>> +      use_operand_p use_p;
>>> +      ssa_op_iter iter;
>>> +      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
>>> +       {
>>> +         name = USE_FROM_PTR (use_p);
>>> +         if (TREE_CODE (name) == SSA_NAME)
>>>
>>> always true.
>>>
>>> +           {
>>> +             /* If an SSA has already been seen, this may be a loop.
>>> +                Fail conservatively.  */
>>> +             if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
>>> +               return false;
>>> +             bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>> +             worklist.safe_push (name);
>>>
>>> See above.
>>>
>>> In principle the thing is sound but I'd like to be able to pass in a
>>> bitmap of
>>> known maybe-undefined/must-defined SSA names to have a cache for
>>> multiple invocations from within a pass (like this unswitching case).
>>
>>
>> Done, though bitmaps are now calculated as part of the instantiation.
>>
>>>
>>> Also once you hit defs that are in a post-dominated region of the loop
>>> entry
>>> you can treat them as not undefined (as their use invokes undefined
>>> behavior).  This is also how you treat function parameters (well,
>>> ssa_undefined_value_p does), where the call site invokes undefined
>>> behavior
>>> when passing in undefined values.  So we need an extra parameter
>>> specifying
>>> the post-dominance region.
>>
>>
>> Done.
>>
>>>
>>> You do not handle memory or calls conservatively which means the existing
>>> testcase only needs some obfuscation to become a problem again.  To fix
>>> that before /* Check that any SSA names used to define NAME is also fully
>>> defined.  */ bail out conservatively, like
>>>
>>>    if (! is_gimple_assign (def)
>>>       || gimple_assign_single_p (def))
>>>     return true;
>>
>>
>> As I asked previously, I understand the !is_gimple_assign, which will skip
>> over GIMPLE_CALLs setting a value, but the "gimple_assign_single_p(def) ==
>> true"??
>>
>> Won't this last one bail on things like e.3_7 = e, or x_77 = y_88? Don't we
>> want to follow up the def chain precisely on these?
>>
>> The attached implementation uses a cache, and a pre-computed post-dominance
>> region.  A previous incantation of this patch (sans the post-dominance
>> stuff) had performance within the noise of the previous implementation.
>>
>> I am testing again, and will do some performance comparisons in a bit, but
>> for now-- are we on the same page here?  Is this what you wanted?
>
> +      /* DEFs in the post-dominated region of the loop entry invoke
> +        undefined behavior.  Adding any use won't make things any
> +        worse.  */
> +      for (unsigned i = 0; i < postdom.length (); ++i)
> +       if (def->bb == postdom[i])
>
> gimple_bb (def)
>
> +         {
> +           set_defined (t);
> +           return false;
> +         }
>
> I think you can't really return false here but need to continue processing
> the worklist.  I'd say rather than a linear walk you can as well use
> dominated_by_p (CDI_POST_DOMINATORS, ...) and record the entry
> block instead?
>
> Note that the way you compute post-dominators doesn't work -- nothing
> keeps them up-to-date when unswitching does CFG manipulations :/
> Fortunately we have a way to compute them per region, see how tree-if-conv.c
> does that (build_region, calculate_dominance_info_for_region).

If I understand correctly, we could compute them per region in 
tree_unswitch_single_loop() for each individual loop with what 
tree-if-conv.c uses.

I could certainly abstract 
build_region/calculate_dominance_info_for_region into something new, say 
calculate_dominance_info_for_loop_region().  Then we could use 
dominated_by_p(CDI_POST_DOMINATORS, ...) in my class as you suggest.

Question... computing the post-dom tree per region as I've just 
described will only create post-dom information for the loop region, 
which means that any definition outside of the loop will have zero 
dominance info.

What if the definition in question post-dominates the loop entry but is 
outside/after of the loop?  We will have no post-dom information for 
this.  In this case, could we just ignore and continue evaluating the 
SSA_NAME with the rest of our heuristics, or did you have something else 
in mind?  Another option would be to calculate the post-dom information 
for the entire function on every loop 
(calculate_dominance_info(CDI_POST_DOMINATORS)), but that seems rather 
expensive.

As a side note, at this point I just want to fix/close the PR in an 
acceptable manner, not come up with the end-all catch-all most-efficient 
patch for an unlikely scenario ;-).

Thanks for taking the time to review and offer suggestions here.

Aldy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-07 12:54       ` Aldy Hernandez
@ 2017-01-09  9:30         ` Richard Biener
  2017-01-13 18:48           ` Aldy Hernandez
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Biener @ 2017-01-09  9:30 UTC (permalink / raw)
  To: Aldy Hernandez; +Cc: gcc-patches, Jeff Law

On Sat, Jan 7, 2017 at 1:54 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
> On 01/04/2017 07:11 AM, Richard Biener wrote:
>>
>> On Tue, Jan 3, 2017 at 6:36 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>>
>>> On 12/20/2016 09:16 AM, Richard Biener wrote:
>>>>
>>>>
>>>> On Fri, Dec 16, 2016 at 3:41 PM, Aldy Hernandez <aldyh@redhat.com>
>>>> wrote:
>>>>>
>>>>>
>>>>> Hi folks.
>>>>>
>>>>> This is a follow-up on Jeff and Richi's interaction on the
>>>>> aforementioned
>>>>> PR
>>>>> here:
>>>>>
>>>>>         https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01397.html
>>>>>
>>>>> I decided to explore the idea of analyzing may-undefness on-demand,
>>>>> which
>>>>> actually looks rather cheap.
>>>>>
>>>>> BTW, I don't understand why we don't have auto_bitmap's, as we already
>>>>> have
>>>>> auto_sbitmap's.  I've implemented the former based on auto_sbitmap's
>>>>> code
>>>>> we
>>>>> already have.
>>>>>
>>>>> The attached patch fixes the bug without introducing any regressions.
>>>>>
>>>>> I also tested the patch by compiling 242 .ii files with -O3.  These
>>>>> were
>>>>> gathered from a stage1 build with -save-temps.  There is a slight time
>>>>> degradation of 4 seconds within 27 minutes of user time:
>>>>>
>>>>>         tainted:        26:52
>>>>>         orig:           26:48
>>>>>
>>>>> This was the average aggregate time of two runs compiling all 242 .ii
>>>>> files.
>>>>> IMO, this looks reasonable.  It is after all, -O3.    Is it acceptable?
>>>>
>>>>
>>>>
>>>> +  while (!worklist.is_empty ())
>>>> +    {
>>>> +      name = worklist.pop ();
>>>> +      gcc_assert (TREE_CODE (name) == SSA_NAME);
>>>> +
>>>> +      if (ssa_undefined_value_p (name, true))
>>>> +       return true;
>>>> +
>>>> +      bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>>>
>>>> it should be already set as we use visited_ssa as "was it ever on the
>>>> worklist",
>>>> so maybe renaming it would be a good thing as well.
>>>
>>>
>>>
>>> I don't understand what you would prefer here.
>>
>>
>> Set the bit when you put name on the worklist (and do not do that if the
>> bit is set).  Thus simply remove the above and add a bitmap_set_bit
>> for the initial name you put on the worklist.
>>
>>>>
>>>> +             if (TREE_CODE (name) == SSA_NAME)
>>>> +               {
>>>> +                 /* If an SSA has already been seen, this may be a
>>>> loop.
>>>> +                    Fail conservatively.  */
>>>> +                 if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION
>>>> (name)))
>>>> +                   return false;
>>>>
>>>> so to me "conservative" is returning true, not false.
>>>
>>>
>>>
>>> OK
>>>
>>>>
>>>> +                 bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>>> +                 worklist.safe_push (name);
>>>>
>>>> but for loops we can just continue and ignore this use.  And
>>>> bitmap_set_bit
>>>> returns whether it set a bit, thus
>>>>
>>>>                 if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION
>>>> (name)))
>>>>                   worklist.safe_push (name);
>>>>
>>>> should work?
>>>
>>>
>>>
>>> Fixed.
>>>
>>>>
>>>> +      /* Check that any SSA names used to define NAME is also fully
>>>> +        defined.  */
>>>> +      use_operand_p use_p;
>>>> +      ssa_op_iter iter;
>>>> +      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
>>>> +       {
>>>> +         name = USE_FROM_PTR (use_p);
>>>> +         if (TREE_CODE (name) == SSA_NAME)
>>>>
>>>> always true.
>>>>
>>>> +           {
>>>> +             /* If an SSA has already been seen, this may be a loop.
>>>> +                Fail conservatively.  */
>>>> +             if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
>>>> +               return false;
>>>> +             bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>>> +             worklist.safe_push (name);
>>>>
>>>> See above.
>>>>
>>>> In principle the thing is sound but I'd like to be able to pass in a
>>>> bitmap of
>>>> known maybe-undefined/must-defined SSA names to have a cache for
>>>> multiple invocations from within a pass (like this unswitching case).
>>>
>>>
>>>
>>> Done, though bitmaps are now calculated as part of the instantiation.
>>>
>>>>
>>>> Also once you hit defs that are in a post-dominated region of the loop
>>>> entry
>>>> you can treat them as not undefined (as their use invokes undefined
>>>> behavior).  This is also how you treat function parameters (well,
>>>> ssa_undefined_value_p does), where the call site invokes undefined
>>>> behavior
>>>> when passing in undefined values.  So we need an extra parameter
>>>> specifying
>>>> the post-dominance region.
>>>
>>>
>>>
>>> Done.
>>>
>>>>
>>>> You do not handle memory or calls conservatively which means the
>>>> existing
>>>> testcase only needs some obfuscation to become a problem again.  To fix
>>>> that before /* Check that any SSA names used to define NAME is also
>>>> fully
>>>> defined.  */ bail out conservatively, like
>>>>
>>>>    if (! is_gimple_assign (def)
>>>>       || gimple_assign_single_p (def))
>>>>     return true;
>>>
>>>
>>>
>>> As I asked previously, I understand the !is_gimple_assign, which will
>>> skip
>>> over GIMPLE_CALLs setting a value, but the "gimple_assign_single_p(def)
>>> ==
>>> true"??
>>>
>>> Won't this last one bail on things like e.3_7 = e, or x_77 = y_88? Don't
>>> we
>>> want to follow up the def chain precisely on these?
>>>
>>> The attached implementation uses a cache, and a pre-computed
>>> post-dominance
>>> region.  A previous incantation of this patch (sans the post-dominance
>>> stuff) had performance within the noise of the previous implementation.
>>>
>>> I am testing again, and will do some performance comparisons in a bit,
>>> but
>>> for now-- are we on the same page here?  Is this what you wanted?
>>
>>
>> +      /* DEFs in the post-dominated region of the loop entry invoke
>> +        undefined behavior.  Adding any use won't make things any
>> +        worse.  */
>> +      for (unsigned i = 0; i < postdom.length (); ++i)
>> +       if (def->bb == postdom[i])
>>
>> gimple_bb (def)
>>
>> +         {
>> +           set_defined (t);
>> +           return false;
>> +         }
>>
>> I think you can't really return false here but need to continue processing
>> the worklist.  I'd say rather than a linear walk you can as well use
>> dominated_by_p (CDI_POST_DOMINATORS, ...) and record the entry
>> block instead?
>>
>> Note that the way you compute post-dominators doesn't work -- nothing
>> keeps them up-to-date when unswitching does CFG manipulations :/
>> Fortunately we have a way to compute them per region, see how
>> tree-if-conv.c
>> does that (build_region, calculate_dominance_info_for_region).
>
>
> If I understand correctly, we could compute them per region in
> tree_unswitch_single_loop() for each individual loop with what
> tree-if-conv.c uses.
>
> I could certainly abstract build_region/calculate_dominance_info_for_region
> into something new, say calculate_dominance_info_for_loop_region().  Then we
> could use dominated_by_p(CDI_POST_DOMINATORS, ...) in my class as you
> suggest.
>
> Question... computing the post-dom tree per region as I've just described
> will only create post-dom information for the loop region, which means that
> any definition outside of the loop will have zero dominance info.

Yes.

> What if the definition in question post-dominates the loop entry but is
> outside/after of the loop?

How would we ever arrive at such def?  Once we reach a def before the
region/loop
we know it's fine to use (the missing "stop" point I pointed out).

>  We will have no post-dom information for this.
> In this case, could we just ignore and continue evaluating the SSA_NAME with
> the rest of our heuristics, or did you have something else in mind?  Another
> option would be to calculate the post-dom information for the entire
> function on every loop (calculate_dominance_info(CDI_POST_DOMINATORS)), but
> that seems rather expensive.
>
> As a side note, at this point I just want to fix/close the PR in an
> acceptable manner, not come up with the end-all catch-all most-efficient
> patch for an unlikely scenario ;-).

Yeah, which is why I wonder whether the caching is worth the trouble (in it's
current form) for the unswitching use (given it's other restrictions
on conditions
to unswitch).

Richard.

> Thanks for taking the time to review and offer suggestions here.
>
> Aldy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-09  9:30         ` Richard Biener
@ 2017-01-13 18:48           ` Aldy Hernandez
  2017-01-18 15:17             ` Richard Biener
  0 siblings, 1 reply; 23+ messages in thread
From: Aldy Hernandez @ 2017-01-13 18:48 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Jeff Law

[Sorry for the delay, I was sick.]

On 01/09/2017 04:30 AM, Richard Biener wrote:
> On Sat, Jan 7, 2017 at 1:54 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>> On 01/04/2017 07:11 AM, Richard Biener wrote:
>>>
>>> On Tue, Jan 3, 2017 at 6:36 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>>>
>>>> On 12/20/2016 09:16 AM, Richard Biener wrote:
>>>>>
>>>>>
>>>>> On Fri, Dec 16, 2016 at 3:41 PM, Aldy Hernandez <aldyh@redhat.com>
>>>>> wrote:
>>>>>>
>>>>>>
>>>>>> Hi folks.
>>>>>>
>>>>>> This is a follow-up on Jeff and Richi's interaction on the
>>>>>> aforementioned
>>>>>> PR
>>>>>> here:
>>>>>>
>>>>>>         https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01397.html
>>>>>>
>>>>>> I decided to explore the idea of analyzing may-undefness on-demand,
>>>>>> which
>>>>>> actually looks rather cheap.
>>>>>>
>>>>>> BTW, I don't understand why we don't have auto_bitmap's, as we already
>>>>>> have
>>>>>> auto_sbitmap's.  I've implemented the former based on auto_sbitmap's
>>>>>> code
>>>>>> we
>>>>>> already have.
>>>>>>
>>>>>> The attached patch fixes the bug without introducing any regressions.
>>>>>>
>>>>>> I also tested the patch by compiling 242 .ii files with -O3.  These
>>>>>> were
>>>>>> gathered from a stage1 build with -save-temps.  There is a slight time
>>>>>> degradation of 4 seconds within 27 minutes of user time:
>>>>>>
>>>>>>         tainted:        26:52
>>>>>>         orig:           26:48
>>>>>>
>>>>>> This was the average aggregate time of two runs compiling all 242 .ii
>>>>>> files.
>>>>>> IMO, this looks reasonable.  It is after all, -O3.    Is it acceptable?
>>>>>
>>>>>
>>>>>
>>>>> +  while (!worklist.is_empty ())
>>>>> +    {
>>>>> +      name = worklist.pop ();
>>>>> +      gcc_assert (TREE_CODE (name) == SSA_NAME);
>>>>> +
>>>>> +      if (ssa_undefined_value_p (name, true))
>>>>> +       return true;
>>>>> +
>>>>> +      bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>>>>
>>>>> it should be already set as we use visited_ssa as "was it ever on the
>>>>> worklist",
>>>>> so maybe renaming it would be a good thing as well.
>>>>
>>>>
>>>>
>>>> I don't understand what you would prefer here.
>>>
>>>
>>> Set the bit when you put name on the worklist (and do not do that if the
>>> bit is set).  Thus simply remove the above and add a bitmap_set_bit
>>> for the initial name you put on the worklist.
>>>
>>>>>
>>>>> +             if (TREE_CODE (name) == SSA_NAME)
>>>>> +               {
>>>>> +                 /* If an SSA has already been seen, this may be a
>>>>> loop.
>>>>> +                    Fail conservatively.  */
>>>>> +                 if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION
>>>>> (name)))
>>>>> +                   return false;
>>>>>
>>>>> so to me "conservative" is returning true, not false.
>>>>
>>>>
>>>>
>>>> OK
>>>>
>>>>>
>>>>> +                 bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>>>> +                 worklist.safe_push (name);
>>>>>
>>>>> but for loops we can just continue and ignore this use.  And
>>>>> bitmap_set_bit
>>>>> returns whether it set a bit, thus
>>>>>
>>>>>                 if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION
>>>>> (name)))
>>>>>                   worklist.safe_push (name);
>>>>>
>>>>> should work?
>>>>
>>>>
>>>>
>>>> Fixed.
>>>>
>>>>>
>>>>> +      /* Check that any SSA names used to define NAME is also fully
>>>>> +        defined.  */
>>>>> +      use_operand_p use_p;
>>>>> +      ssa_op_iter iter;
>>>>> +      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
>>>>> +       {
>>>>> +         name = USE_FROM_PTR (use_p);
>>>>> +         if (TREE_CODE (name) == SSA_NAME)
>>>>>
>>>>> always true.
>>>>>
>>>>> +           {
>>>>> +             /* If an SSA has already been seen, this may be a loop.
>>>>> +                Fail conservatively.  */
>>>>> +             if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
>>>>> +               return false;
>>>>> +             bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>>>> +             worklist.safe_push (name);
>>>>>
>>>>> See above.
>>>>>
>>>>> In principle the thing is sound but I'd like to be able to pass in a
>>>>> bitmap of
>>>>> known maybe-undefined/must-defined SSA names to have a cache for
>>>>> multiple invocations from within a pass (like this unswitching case).
>>>>
>>>>
>>>>
>>>> Done, though bitmaps are now calculated as part of the instantiation.
>>>>
>>>>>
>>>>> Also once you hit defs that are in a post-dominated region of the loop
>>>>> entry
>>>>> you can treat them as not undefined (as their use invokes undefined
>>>>> behavior).  This is also how you treat function parameters (well,
>>>>> ssa_undefined_value_p does), where the call site invokes undefined
>>>>> behavior
>>>>> when passing in undefined values.  So we need an extra parameter
>>>>> specifying
>>>>> the post-dominance region.
>>>>
>>>>
>>>>
>>>> Done.
>>>>
>>>>>
>>>>> You do not handle memory or calls conservatively which means the
>>>>> existing
>>>>> testcase only needs some obfuscation to become a problem again.  To fix
>>>>> that before /* Check that any SSA names used to define NAME is also
>>>>> fully
>>>>> defined.  */ bail out conservatively, like
>>>>>
>>>>>    if (! is_gimple_assign (def)
>>>>>       || gimple_assign_single_p (def))
>>>>>     return true;
>>>>
>>>>
>>>>
>>>> As I asked previously, I understand the !is_gimple_assign, which will
>>>> skip
>>>> over GIMPLE_CALLs setting a value, but the "gimple_assign_single_p(def)
>>>> ==
>>>> true"??
>>>>
>>>> Won't this last one bail on things like e.3_7 = e, or x_77 = y_88? Don't
>>>> we
>>>> want to follow up the def chain precisely on these?
>>>>
>>>> The attached implementation uses a cache, and a pre-computed
>>>> post-dominance
>>>> region.  A previous incantation of this patch (sans the post-dominance
>>>> stuff) had performance within the noise of the previous implementation.
>>>>
>>>> I am testing again, and will do some performance comparisons in a bit,
>>>> but
>>>> for now-- are we on the same page here?  Is this what you wanted?
>>>
>>>
>>> +      /* DEFs in the post-dominated region of the loop entry invoke
>>> +        undefined behavior.  Adding any use won't make things any
>>> +        worse.  */
>>> +      for (unsigned i = 0; i < postdom.length (); ++i)
>>> +       if (def->bb == postdom[i])
>>>
>>> gimple_bb (def)
>>>
>>> +         {
>>> +           set_defined (t);
>>> +           return false;
>>> +         }
>>>
>>> I think you can't really return false here but need to continue processing
>>> the worklist.  I'd say rather than a linear walk you can as well use
>>> dominated_by_p (CDI_POST_DOMINATORS, ...) and record the entry
>>> block instead?
>>>
>>> Note that the way you compute post-dominators doesn't work -- nothing
>>> keeps them up-to-date when unswitching does CFG manipulations :/
>>> Fortunately we have a way to compute them per region, see how
>>> tree-if-conv.c
>>> does that (build_region, calculate_dominance_info_for_region).
>>
>>
>> If I understand correctly, we could compute them per region in
>> tree_unswitch_single_loop() for each individual loop with what
>> tree-if-conv.c uses.
>>
>> I could certainly abstract build_region/calculate_dominance_info_for_region
>> into something new, say calculate_dominance_info_for_loop_region().  Then we
>> could use dominated_by_p(CDI_POST_DOMINATORS, ...) in my class as you
>> suggest.
>>
>> Question... computing the post-dom tree per region as I've just described
>> will only create post-dom information for the loop region, which means that
>> any definition outside of the loop will have zero dominance info.
>
> Yes.
>
>> What if the definition in question post-dominates the loop entry but is
>> outside/after of the loop?
>
> How would we ever arrive at such def?  Once we reach a def before the
> region/loop
> we know it's fine to use (the missing "stop" point I pointed out).

Hmmm, it looks like we can't even build the post-dom region 
appropriately in the presence of infinite loops.

First, build_region() fails if it can't find a loop post-header:

   /* The last element is loop post-header.  */
   gcc_assert (exit_bb);

which we won't have in an infinite loop like:

bb2: //loop header
|
V
loop 1:
bb3:		<-------+
	bar();		|
|			|
V			|
bb4:			|
	goto bb3;   >---+


We could just build the region without the post-header, but then we fail 
while building the DFS dominance region:

dom_info::calc_dfs_tree_nonrec (basic_block bb)
....
	  gcc_assert (bn != m_start_block);

Presumably because we've looped back to the start block (bb4 in a post 
dominator world).  This doesn't happen calculating going forward because 
we always have a start block outside of the region (the function entry 
block).

I tried to fake edge my way out of it, but things get really messed up 
while putting/removing fake edges in the presence of loop info.  I'm 
assuming all this is by design.

Would you prefer me to continue down this path, trying to build a 
post-dom region with infinite loops and fixing build_region / 
calc_dfs_tree_nonrec, or could we do without this dominance 
optimization?  Pretty please?

>
>>  We will have no post-dom information for this.
>> In this case, could we just ignore and continue evaluating the SSA_NAME with
>> the rest of our heuristics, or did you have something else in mind?  Another
>> option would be to calculate the post-dom information for the entire
>> function on every loop (calculate_dominance_info(CDI_POST_DOMINATORS)), but
>> that seems rather expensive.
>>
>> As a side note, at this point I just want to fix/close the PR in an
>> acceptable manner, not come up with the end-all catch-all most-efficient
>> patch for an unlikely scenario ;-).
>
> Yeah, which is why I wonder whether the caching is worth the trouble (in it's
> current form) for the unswitching use (given it's other restrictions
> on conditions
> to unswitch).

We could go back to my original, no caching version (with the other 
suggestions).  That solves the problem quite simply, with no regressions.

Thanks again.
Aldy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-13 18:48           ` Aldy Hernandez
@ 2017-01-18 15:17             ` Richard Biener
  2017-01-23 17:28               ` Aldy Hernandez
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Biener @ 2017-01-18 15:17 UTC (permalink / raw)
  To: Aldy Hernandez; +Cc: gcc-patches, Jeff Law

On Fri, Jan 13, 2017 at 7:48 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
> [Sorry for the delay, I was sick.]
>
>
> On 01/09/2017 04:30 AM, Richard Biener wrote:
>>
>> On Sat, Jan 7, 2017 at 1:54 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>>
>>> On 01/04/2017 07:11 AM, Richard Biener wrote:
>>>>
>>>>
>>>> On Tue, Jan 3, 2017 at 6:36 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>>>>
>>>>>
>>>>> On 12/20/2016 09:16 AM, Richard Biener wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On Fri, Dec 16, 2016 at 3:41 PM, Aldy Hernandez <aldyh@redhat.com>
>>>>>> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Hi folks.
>>>>>>>
>>>>>>> This is a follow-up on Jeff and Richi's interaction on the
>>>>>>> aforementioned
>>>>>>> PR
>>>>>>> here:
>>>>>>>
>>>>>>>         https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01397.html
>>>>>>>
>>>>>>> I decided to explore the idea of analyzing may-undefness on-demand,
>>>>>>> which
>>>>>>> actually looks rather cheap.
>>>>>>>
>>>>>>> BTW, I don't understand why we don't have auto_bitmap's, as we
>>>>>>> already
>>>>>>> have
>>>>>>> auto_sbitmap's.  I've implemented the former based on auto_sbitmap's
>>>>>>> code
>>>>>>> we
>>>>>>> already have.
>>>>>>>
>>>>>>> The attached patch fixes the bug without introducing any regressions.
>>>>>>>
>>>>>>> I also tested the patch by compiling 242 .ii files with -O3.  These
>>>>>>> were
>>>>>>> gathered from a stage1 build with -save-temps.  There is a slight
>>>>>>> time
>>>>>>> degradation of 4 seconds within 27 minutes of user time:
>>>>>>>
>>>>>>>         tainted:        26:52
>>>>>>>         orig:           26:48
>>>>>>>
>>>>>>> This was the average aggregate time of two runs compiling all 242 .ii
>>>>>>> files.
>>>>>>> IMO, this looks reasonable.  It is after all, -O3.    Is it
>>>>>>> acceptable?
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> +  while (!worklist.is_empty ())
>>>>>> +    {
>>>>>> +      name = worklist.pop ();
>>>>>> +      gcc_assert (TREE_CODE (name) == SSA_NAME);
>>>>>> +
>>>>>> +      if (ssa_undefined_value_p (name, true))
>>>>>> +       return true;
>>>>>> +
>>>>>> +      bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>>>>>
>>>>>> it should be already set as we use visited_ssa as "was it ever on the
>>>>>> worklist",
>>>>>> so maybe renaming it would be a good thing as well.
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> I don't understand what you would prefer here.
>>>>
>>>>
>>>>
>>>> Set the bit when you put name on the worklist (and do not do that if the
>>>> bit is set).  Thus simply remove the above and add a bitmap_set_bit
>>>> for the initial name you put on the worklist.
>>>>
>>>>>>
>>>>>> +             if (TREE_CODE (name) == SSA_NAME)
>>>>>> +               {
>>>>>> +                 /* If an SSA has already been seen, this may be a
>>>>>> loop.
>>>>>> +                    Fail conservatively.  */
>>>>>> +                 if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION
>>>>>> (name)))
>>>>>> +                   return false;
>>>>>>
>>>>>> so to me "conservative" is returning true, not false.
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> OK
>>>>>
>>>>>>
>>>>>> +                 bitmap_set_bit (visited_ssa, SSA_NAME_VERSION
>>>>>> (name));
>>>>>> +                 worklist.safe_push (name);
>>>>>>
>>>>>> but for loops we can just continue and ignore this use.  And
>>>>>> bitmap_set_bit
>>>>>> returns whether it set a bit, thus
>>>>>>
>>>>>>                 if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION
>>>>>> (name)))
>>>>>>                   worklist.safe_push (name);
>>>>>>
>>>>>> should work?
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> Fixed.
>>>>>
>>>>>>
>>>>>> +      /* Check that any SSA names used to define NAME is also fully
>>>>>> +        defined.  */
>>>>>> +      use_operand_p use_p;
>>>>>> +      ssa_op_iter iter;
>>>>>> +      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
>>>>>> +       {
>>>>>> +         name = USE_FROM_PTR (use_p);
>>>>>> +         if (TREE_CODE (name) == SSA_NAME)
>>>>>>
>>>>>> always true.
>>>>>>
>>>>>> +           {
>>>>>> +             /* If an SSA has already been seen, this may be a loop.
>>>>>> +                Fail conservatively.  */
>>>>>> +             if (bitmap_bit_p (visited_ssa, SSA_NAME_VERSION (name)))
>>>>>> +               return false;
>>>>>> +             bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
>>>>>> +             worklist.safe_push (name);
>>>>>>
>>>>>> See above.
>>>>>>
>>>>>> In principle the thing is sound but I'd like to be able to pass in a
>>>>>> bitmap of
>>>>>> known maybe-undefined/must-defined SSA names to have a cache for
>>>>>> multiple invocations from within a pass (like this unswitching case).
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> Done, though bitmaps are now calculated as part of the instantiation.
>>>>>
>>>>>>
>>>>>> Also once you hit defs that are in a post-dominated region of the loop
>>>>>> entry
>>>>>> you can treat them as not undefined (as their use invokes undefined
>>>>>> behavior).  This is also how you treat function parameters (well,
>>>>>> ssa_undefined_value_p does), where the call site invokes undefined
>>>>>> behavior
>>>>>> when passing in undefined values.  So we need an extra parameter
>>>>>> specifying
>>>>>> the post-dominance region.
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> Done.
>>>>>
>>>>>>
>>>>>> You do not handle memory or calls conservatively which means the
>>>>>> existing
>>>>>> testcase only needs some obfuscation to become a problem again.  To
>>>>>> fix
>>>>>> that before /* Check that any SSA names used to define NAME is also
>>>>>> fully
>>>>>> defined.  */ bail out conservatively, like
>>>>>>
>>>>>>    if (! is_gimple_assign (def)
>>>>>>       || gimple_assign_single_p (def))
>>>>>>     return true;
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> As I asked previously, I understand the !is_gimple_assign, which will
>>>>> skip
>>>>> over GIMPLE_CALLs setting a value, but the "gimple_assign_single_p(def)
>>>>> ==
>>>>> true"??
>>>>>
>>>>> Won't this last one bail on things like e.3_7 = e, or x_77 = y_88?
>>>>> Don't
>>>>> we
>>>>> want to follow up the def chain precisely on these?
>>>>>
>>>>> The attached implementation uses a cache, and a pre-computed
>>>>> post-dominance
>>>>> region.  A previous incantation of this patch (sans the post-dominance
>>>>> stuff) had performance within the noise of the previous implementation.
>>>>>
>>>>> I am testing again, and will do some performance comparisons in a bit,
>>>>> but
>>>>> for now-- are we on the same page here?  Is this what you wanted?
>>>>
>>>>
>>>>
>>>> +      /* DEFs in the post-dominated region of the loop entry invoke
>>>> +        undefined behavior.  Adding any use won't make things any
>>>> +        worse.  */
>>>> +      for (unsigned i = 0; i < postdom.length (); ++i)
>>>> +       if (def->bb == postdom[i])
>>>>
>>>> gimple_bb (def)
>>>>
>>>> +         {
>>>> +           set_defined (t);
>>>> +           return false;
>>>> +         }
>>>>
>>>> I think you can't really return false here but need to continue
>>>> processing
>>>> the worklist.  I'd say rather than a linear walk you can as well use
>>>> dominated_by_p (CDI_POST_DOMINATORS, ...) and record the entry
>>>> block instead?
>>>>
>>>> Note that the way you compute post-dominators doesn't work -- nothing
>>>> keeps them up-to-date when unswitching does CFG manipulations :/
>>>> Fortunately we have a way to compute them per region, see how
>>>> tree-if-conv.c
>>>> does that (build_region, calculate_dominance_info_for_region).
>>>
>>>
>>>
>>> If I understand correctly, we could compute them per region in
>>> tree_unswitch_single_loop() for each individual loop with what
>>> tree-if-conv.c uses.
>>>
>>> I could certainly abstract
>>> build_region/calculate_dominance_info_for_region
>>> into something new, say calculate_dominance_info_for_loop_region().  Then
>>> we
>>> could use dominated_by_p(CDI_POST_DOMINATORS, ...) in my class as you
>>> suggest.
>>>
>>> Question... computing the post-dom tree per region as I've just described
>>> will only create post-dom information for the loop region, which means
>>> that
>>> any definition outside of the loop will have zero dominance info.
>>
>>
>> Yes.
>>
>>> What if the definition in question post-dominates the loop entry but is
>>> outside/after of the loop?
>>
>>
>> How would we ever arrive at such def?  Once we reach a def before the
>> region/loop
>> we know it's fine to use (the missing "stop" point I pointed out).
>
>
> Hmmm, it looks like we can't even build the post-dom region appropriately in
> the presence of infinite loops.
>
> First, build_region() fails if it can't find a loop post-header:
>
>   /* The last element is loop post-header.  */
>   gcc_assert (exit_bb);
>
> which we won't have in an infinite loop like:
>
> bb2: //loop header
> |
> V
> loop 1:
> bb3:            <-------+
>         bar();          |
> |                       |
> V                       |
> bb4:                    |
>         goto bb3;   >---+
>
>
> We could just build the region without the post-header, but then we fail
> while building the DFS dominance region:
>
> dom_info::calc_dfs_tree_nonrec (basic_block bb)
> ....
>           gcc_assert (bn != m_start_block);
>
> Presumably because we've looped back to the start block (bb4 in a post
> dominator world).  This doesn't happen calculating going forward because we
> always have a start block outside of the region (the function entry block).
>
> I tried to fake edge my way out of it, but things get really messed up while
> putting/removing fake edges in the presence of loop info.  I'm assuming all
> this is by design.

Heh.  Infinite loops are indeed fun, and yes, the fake edges confuse some
of the loop stuff.

> Would you prefer me to continue down this path, trying to build a post-dom
> region with infinite loops and fixing build_region / calc_dfs_tree_nonrec,
> or could we do without this dominance optimization?  Pretty please?

It will pessimize some cases though, but see below.

>>
>>>  We will have no post-dom information for this.
>>> In this case, could we just ignore and continue evaluating the SSA_NAME
>>> with
>>> the rest of our heuristics, or did you have something else in mind?
>>> Another
>>> option would be to calculate the post-dom information for the entire
>>> function on every loop (calculate_dominance_info(CDI_POST_DOMINATORS)),
>>> but
>>> that seems rather expensive.
>>>
>>> As a side note, at this point I just want to fix/close the PR in an
>>> acceptable manner, not come up with the end-all catch-all most-efficient
>>> patch for an unlikely scenario ;-).
>>
>>
>> Yeah, which is why I wonder whether the caching is worth the trouble (in
>> it's
>> current form) for the unswitching use (given it's other restrictions
>> on conditions
>> to unswitch).
>
>
> We could go back to my original, no caching version (with the other
> suggestions).  That solves the problem quite simply, with no regressions.

So let's go with a unswitching-local solution then.  Based on
tree_may_unswitch_on:

  /* Condition must be invariant.  */
  FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
    {
      /* Unswitching on undefined values would introduce undefined
         behavior that the original program might never exercise.  */
      if (ssa_undefined_value_p (use, true))
        return NULL_TREE;
      def = SSA_NAME_DEF_STMT (use);
      def_bb = gimple_bb (def);
      if (def_bb
          && flow_bb_inside_loop_p (loop, def_bb))
        return NULL_TREE;

we only have to look for uses in blocks dominating the loop header block
(or in blocks post-dominating that loop header, but we can probably implement
that by simply including the loop header itself with a FIXME comment).

Note that we only need to know whether a BB will be always executed when
the loop is entered -- there's "infrastructure" elsewhere that computes this
w/o the need of post-dominance.  For example see fill_always_executed_in_1
tree-ssa-loop-im.c (we can't use that 1:1 I think because we already use ->aux
via the original copy tables, but we could simplify it as we're only
interested in
the loop which preheader we put the unswitch condition on so we can use
a bitmap to record whether a block of the loop is always executed or not).

Can you send a patch that does this maybe?

Thanks,
Richard.


> Thanks again.
> Aldy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-18 15:17             ` Richard Biener
@ 2017-01-23 17:28               ` Aldy Hernandez
  2017-01-24 12:25                 ` Richard Biener
  0 siblings, 1 reply; 23+ messages in thread
From: Aldy Hernandez @ 2017-01-23 17:28 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Jeff Law

On 01/18/2017 10:10 AM, Richard Biener wrote:
> On Fri, Jan 13, 2017 at 7:48 PM, Aldy Hernandez <aldyh@redhat.com> wrote:

Hi Richard.

I'd be happy to provide a patch, but could you please elaborate, as I'm 
afraid I'm not following.

>> We could go back to my original, no caching version (with the other
>> suggestions).  That solves the problem quite simply, with no regressions.
>
> So let's go with a unswitching-local solution then.  Based on
> tree_may_unswitch_on:

What do you mean by unswitching-local?

>
>   /* Condition must be invariant.  */
>   FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
>     {
>       /* Unswitching on undefined values would introduce undefined
>          behavior that the original program might never exercise.  */
>       if (ssa_undefined_value_p (use, true))
>         return NULL_TREE;
>       def = SSA_NAME_DEF_STMT (use);
>       def_bb = gimple_bb (def);
>       if (def_bb
>           && flow_bb_inside_loop_p (loop, def_bb))
>         return NULL_TREE;
>
> we only have to look for uses in blocks dominating the loop header block
> (or in blocks post-dominating that loop header, but we can probably implement
> that by simply including the loop header itself with a FIXME comment).

Look for *uses*??  Do you mean definitions?  I mean, we're trying to 
figure out whether we are going to unswitch on a use that is inside a 
the loop, not before or after. So perhaps we only care about 
*definitions* (SSA_NAME_DEF_STMT) dominating the loop header.

>
> Note that we only need to know whether a BB will be always executed when
> the loop is entered -- there's "infrastructure" elsewhere that computes this
> w/o the need of post-dominance.  For example see fill_always_executed_in_1
> tree-ssa-loop-im.c (we can't use that 1:1 I think because we already use ->aux
> via the original copy tables, but we could simplify it as we're only
> interested in
> the loop which preheader we put the unswitch condition on so we can use
> a bitmap to record whether a block of the loop is always executed or not).

I don't see any use of ->aux within loop unswitching, so perhaps no 
adjustment is needed?  I verified this by successfully bootstrapping with:

diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index 92599fb..774d6bf 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -94,6 +94,14 @@ tree_ssa_unswitch_loops (void)
    struct loop *loop;
    bool changed = false;

+  basic_block bb;
+  FOR_ALL_BB_FN (bb, cfun)
+    if (bb->aux)
+      {
+       gcc_unreachable ();
+      }

Furthermore, you say that we have this "infrastructure" without the need 
for post-dominance.  But we still need dominance info.  The function 
fill_always_execute_in_1 uses CDI_DOMINATORS both inside said function, 
and throughout its dependencies.  I thought the point of pre-calculating 
dominance info (or post-dominance info) originally was because we 
couldn't depend on dominance info being kept up to date between 
executions of tree_unswitch_single_loop(), which BTW, runs recursively.

> Can you send a patch that does this maybe?

Sure, when I understand what you suggest ;-).

Again, thanks for your input here.
Aldy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-23 17:28               ` Aldy Hernandez
@ 2017-01-24 12:25                 ` Richard Biener
  2017-01-26 12:04                   ` Aldy Hernandez
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Biener @ 2017-01-24 12:25 UTC (permalink / raw)
  To: Aldy Hernandez; +Cc: gcc-patches, Jeff Law

On Mon, Jan 23, 2017 at 6:26 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
> On 01/18/2017 10:10 AM, Richard Biener wrote:
>>
>> On Fri, Jan 13, 2017 at 7:48 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>
>
> Hi Richard.
>
> I'd be happy to provide a patch, but could you please elaborate, as I'm
> afraid I'm not following.
>
>>> We could go back to my original, no caching version (with the other
>>> suggestions).  That solves the problem quite simply, with no regressions.
>>
>>
>> So let's go with a unswitching-local solution then.  Based on
>> tree_may_unswitch_on:
>
>
> What do you mean by unswitching-local?

Like your original patch, not adding new generic infrastructure
outside of tree-ssa-unswitch.c.

>>
>>   /* Condition must be invariant.  */
>>   FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
>>     {
>>       /* Unswitching on undefined values would introduce undefined
>>          behavior that the original program might never exercise.  */
>>       if (ssa_undefined_value_p (use, true))
>>         return NULL_TREE;
>>       def = SSA_NAME_DEF_STMT (use);
>>       def_bb = gimple_bb (def);
>>       if (def_bb
>>           && flow_bb_inside_loop_p (loop, def_bb))
>>         return NULL_TREE;
>>
>> we only have to look for uses in blocks dominating the loop header block
>> (or in blocks post-dominating that loop header, but we can probably
>> implement
>> that by simply including the loop header itself with a FIXME comment).
>
>
> Look for *uses*??  Do you mean definitions?  I mean, we're trying to figure
> out whether we are going to unswitch on a use that is inside a the loop, not
> before or after. So perhaps we only care about *definitions*
> (SSA_NAME_DEF_STMT) dominating the loop header.

We're looking for stmts using the 'use's in the above loop on a path that is
always executed when the loop is entered.  So we're not introducing a
use of a possibly undefined value.

Of course we can also prove that 'use' is in fact not undefined looking at
its defs which are obviously always dominating the loop header if the
condition satisfies tree_may_unswitch_on (non-dominating defs will
have uses in PHIs dominating  the header which we have to treat
conservatively of course).

>
>>
>> Note that we only need to know whether a BB will be always executed when
>> the loop is entered -- there's "infrastructure" elsewhere that computes
>> this
>> w/o the need of post-dominance.  For example see fill_always_executed_in_1
>> tree-ssa-loop-im.c (we can't use that 1:1 I think because we already use
>> ->aux
>> via the original copy tables, but we could simplify it as we're only
>> interested in
>> the loop which preheader we put the unswitch condition on so we can use
>> a bitmap to record whether a block of the loop is always executed or not).
>
>
> I don't see any use of ->aux within loop unswitching, so perhaps no
> adjustment is needed?  I verified this by successfully bootstrapping with:
>
> diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
> index 92599fb..774d6bf 100644
> --- a/gcc/tree-ssa-loop-unswitch.c
> +++ b/gcc/tree-ssa-loop-unswitch.c
> @@ -94,6 +94,14 @@ tree_ssa_unswitch_loops (void)
>    struct loop *loop;
>    bool changed = false;
>
> +  basic_block bb;
> +  FOR_ALL_BB_FN (bb, cfun)
> +    if (bb->aux)
> +      {
> +       gcc_unreachable ();
> +      }
>
> Furthermore, you say that we have this "infrastructure" without the need for
> post-dominance.  But we still need dominance info.  The function
> fill_always_execute_in_1 uses CDI_DOMINATORS both inside said function, and
> throughout its dependencies.  I thought the point of pre-calculating
> dominance info (or post-dominance info) originally was because we couldn't
> depend on dominance info being kept up to date between executions of
> tree_unswitch_single_loop(), which BTW, runs recursively.

dominators are kept up-to-date within cfg manipulation routines and unswitching
already uses them.

So if you just try to prove 'use' is defined you don't even need
dominators.  But
that misses cases like

int x;
foo ()
{
  for (;;)
    {
       if (x == 5)
         ...;
    }
}

where unswitching is valid because x is always used when the loop is entered.
Similar

int x, a[10];
foo (int c)
{
  foo (x);
  for (i=0;i<c;++i)
    {
       if (a[c])
         break;
       if (x == 5)
         ...;
    }
}

even though the condition is not always executed if the loop is entered.

>> Can you send a patch that does this maybe?
>
>
> Sure, when I understand what you suggest ;-).
>
> Again, thanks for your input here.
> Aldy
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-24 12:25                 ` Richard Biener
@ 2017-01-26 12:04                   ` Aldy Hernandez
  2017-01-26 12:48                     ` Richard Biener
  0 siblings, 1 reply; 23+ messages in thread
From: Aldy Hernandez @ 2017-01-26 12:04 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 5795 bytes --]

On 01/24/2017 07:23 AM, Richard Biener wrote:
> On Mon, Jan 23, 2017 at 6:26 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>> On 01/18/2017 10:10 AM, Richard Biener wrote:
>>>
>>> On Fri, Jan 13, 2017 at 7:48 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>
>>
>> Hi Richard.
>>
>> I'd be happy to provide a patch, but could you please elaborate, as I'm
>> afraid I'm not following.
>>
>>>> We could go back to my original, no caching version (with the other
>>>> suggestions).  That solves the problem quite simply, with no regressions.
>>>
>>>
>>> So let's go with a unswitching-local solution then.  Based on
>>> tree_may_unswitch_on:
>>
>>
>> What do you mean by unswitching-local?
>
> Like your original patch, not adding new generic infrastructure
> outside of tree-ssa-unswitch.c.
>
>>>
>>>   /* Condition must be invariant.  */
>>>   FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
>>>     {
>>>       /* Unswitching on undefined values would introduce undefined
>>>          behavior that the original program might never exercise.  */
>>>       if (ssa_undefined_value_p (use, true))
>>>         return NULL_TREE;
>>>       def = SSA_NAME_DEF_STMT (use);
>>>       def_bb = gimple_bb (def);
>>>       if (def_bb
>>>           && flow_bb_inside_loop_p (loop, def_bb))
>>>         return NULL_TREE;
>>>
>>> we only have to look for uses in blocks dominating the loop header block
>>> (or in blocks post-dominating that loop header, but we can probably
>>> implement
>>> that by simply including the loop header itself with a FIXME comment).
>>
>>
>> Look for *uses*??  Do you mean definitions?  I mean, we're trying to figure
>> out whether we are going to unswitch on a use that is inside a the loop, not
>> before or after. So perhaps we only care about *definitions*
>> (SSA_NAME_DEF_STMT) dominating the loop header.
>
> We're looking for stmts using the 'use's in the above loop on a path that is
> always executed when the loop is entered.  So we're not introducing a
> use of a possibly undefined value.
>
> Of course we can also prove that 'use' is in fact not undefined looking at
> its defs which are obviously always dominating the loop header if the
> condition satisfies tree_may_unswitch_on (non-dominating defs will
> have uses in PHIs dominating  the header which we have to treat
> conservatively of course).
>
>>
>>>
>>> Note that we only need to know whether a BB will be always executed when
>>> the loop is entered -- there's "infrastructure" elsewhere that computes
>>> this
>>> w/o the need of post-dominance.  For example see fill_always_executed_in_1
>>> tree-ssa-loop-im.c (we can't use that 1:1 I think because we already use
>>> ->aux
>>> via the original copy tables, but we could simplify it as we're only
>>> interested in
>>> the loop which preheader we put the unswitch condition on so we can use
>>> a bitmap to record whether a block of the loop is always executed or not).
>>
>>
>> I don't see any use of ->aux within loop unswitching, so perhaps no
>> adjustment is needed?  I verified this by successfully bootstrapping with:
>>
>> diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
>> index 92599fb..774d6bf 100644
>> --- a/gcc/tree-ssa-loop-unswitch.c
>> +++ b/gcc/tree-ssa-loop-unswitch.c
>> @@ -94,6 +94,14 @@ tree_ssa_unswitch_loops (void)
>>    struct loop *loop;
>>    bool changed = false;
>>
>> +  basic_block bb;
>> +  FOR_ALL_BB_FN (bb, cfun)
>> +    if (bb->aux)
>> +      {
>> +       gcc_unreachable ();
>> +      }
>>
>> Furthermore, you say that we have this "infrastructure" without the need for
>> post-dominance.  But we still need dominance info.  The function
>> fill_always_execute_in_1 uses CDI_DOMINATORS both inside said function, and
>> throughout its dependencies.  I thought the point of pre-calculating
>> dominance info (or post-dominance info) originally was because we couldn't
>> depend on dominance info being kept up to date between executions of
>> tree_unswitch_single_loop(), which BTW, runs recursively.
>
> dominators are kept up-to-date within cfg manipulation routines and unswitching
> already uses them.
>
> So if you just try to prove 'use' is defined you don't even need
> dominators.  But
> that misses cases like
>
> int x;
> foo ()
> {
>   for (;;)
>     {
>        if (x == 5)
>          ...;
>     }
> }
>
> where unswitching is valid because x is always used when the loop is entered.
> Similar
>
> int x, a[10];
> foo (int c)
> {
>   foo (x);
>   for (i=0;i<c;++i)
>     {
>        if (a[c])
>          break;
>        if (x == 5)
>          ...;
>     }
> }
>
> even though the condition is not always executed if the loop is entered.

I don't think fill_always_execute_in_1 gives us what (I think) you want. 
   For the attached graph, fill_always_execute_in_1 only marks the 
following basic blocks:

bb 4 in loop 2
bb 6 in loop 3
bb 9 in loop 1

Which are basically the loop headers.  Unless I'm misunderstanding 
something, this is useless for the problem at hand.

For example, for loop 3, I would've expected bb5 to be marked as always 
executed if we enter loop 3 since it is its immediate predecessor. 
Similarly for loop 2.  I would expect bb3 to be marked as always being 
executed if we enter loop 2.  And bb10 always enters loop 1.

Also, in other testcases, for loops where the flow loops back to the 
loop-header, we don't even get the loop header marked as always executed 
in the loop.

I still think my initial on-demand approach is straightforward, 
conservative, and solves the problem at hand.  Since I think I'm either 
misunderstanding here, or spending way too much time on this (I think 
this is the 3rd or 4th approach if we count Jeff's original one), I may 
have to respectfully put this back on the queue for someone else to tackle.

Aldy

[-- Attachment #2: foo.ps.gz --]
[-- Type: application/gzip, Size: 7119 bytes --]

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-26 12:04                   ` Aldy Hernandez
@ 2017-01-26 12:48                     ` Richard Biener
  2017-01-27 11:32                       ` Aldy Hernandez
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Biener @ 2017-01-26 12:48 UTC (permalink / raw)
  To: Aldy Hernandez; +Cc: gcc-patches, Jeff Law

On Thu, Jan 26, 2017 at 1:04 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
> On 01/24/2017 07:23 AM, Richard Biener wrote:
>>
>> On Mon, Jan 23, 2017 at 6:26 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>>
>>> On 01/18/2017 10:10 AM, Richard Biener wrote:
>>>>
>>>>
>>>> On Fri, Jan 13, 2017 at 7:48 PM, Aldy Hernandez <aldyh@redhat.com>
>>>> wrote:
>>>
>>>
>>>
>>> Hi Richard.
>>>
>>> I'd be happy to provide a patch, but could you please elaborate, as I'm
>>> afraid I'm not following.
>>>
>>>>> We could go back to my original, no caching version (with the other
>>>>> suggestions).  That solves the problem quite simply, with no
>>>>> regressions.
>>>>
>>>>
>>>>
>>>> So let's go with a unswitching-local solution then.  Based on
>>>> tree_may_unswitch_on:
>>>
>>>
>>>
>>> What do you mean by unswitching-local?
>>
>>
>> Like your original patch, not adding new generic infrastructure
>> outside of tree-ssa-unswitch.c.
>>
>>>>
>>>>   /* Condition must be invariant.  */
>>>>   FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
>>>>     {
>>>>       /* Unswitching on undefined values would introduce undefined
>>>>          behavior that the original program might never exercise.  */
>>>>       if (ssa_undefined_value_p (use, true))
>>>>         return NULL_TREE;
>>>>       def = SSA_NAME_DEF_STMT (use);
>>>>       def_bb = gimple_bb (def);
>>>>       if (def_bb
>>>>           && flow_bb_inside_loop_p (loop, def_bb))
>>>>         return NULL_TREE;
>>>>
>>>> we only have to look for uses in blocks dominating the loop header block
>>>> (or in blocks post-dominating that loop header, but we can probably
>>>> implement
>>>> that by simply including the loop header itself with a FIXME comment).
>>>
>>>
>>>
>>> Look for *uses*??  Do you mean definitions?  I mean, we're trying to
>>> figure
>>> out whether we are going to unswitch on a use that is inside a the loop,
>>> not
>>> before or after. So perhaps we only care about *definitions*
>>> (SSA_NAME_DEF_STMT) dominating the loop header.
>>
>>
>> We're looking for stmts using the 'use's in the above loop on a path that
>> is
>> always executed when the loop is entered.  So we're not introducing a
>> use of a possibly undefined value.
>>
>> Of course we can also prove that 'use' is in fact not undefined looking at
>> its defs which are obviously always dominating the loop header if the
>> condition satisfies tree_may_unswitch_on (non-dominating defs will
>> have uses in PHIs dominating  the header which we have to treat
>> conservatively of course).
>>
>>>
>>>>
>>>> Note that we only need to know whether a BB will be always executed when
>>>> the loop is entered -- there's "infrastructure" elsewhere that computes
>>>> this
>>>> w/o the need of post-dominance.  For example see
>>>> fill_always_executed_in_1
>>>> tree-ssa-loop-im.c (we can't use that 1:1 I think because we already use
>>>> ->aux
>>>> via the original copy tables, but we could simplify it as we're only
>>>> interested in
>>>> the loop which preheader we put the unswitch condition on so we can use
>>>> a bitmap to record whether a block of the loop is always executed or
>>>> not).
>>>
>>>
>>>
>>> I don't see any use of ->aux within loop unswitching, so perhaps no
>>> adjustment is needed?  I verified this by successfully bootstrapping
>>> with:
>>>
>>> diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
>>> index 92599fb..774d6bf 100644
>>> --- a/gcc/tree-ssa-loop-unswitch.c
>>> +++ b/gcc/tree-ssa-loop-unswitch.c
>>> @@ -94,6 +94,14 @@ tree_ssa_unswitch_loops (void)
>>>    struct loop *loop;
>>>    bool changed = false;
>>>
>>> +  basic_block bb;
>>> +  FOR_ALL_BB_FN (bb, cfun)
>>> +    if (bb->aux)
>>> +      {
>>> +       gcc_unreachable ();
>>> +      }
>>>
>>> Furthermore, you say that we have this "infrastructure" without the need
>>> for
>>> post-dominance.  But we still need dominance info.  The function
>>> fill_always_execute_in_1 uses CDI_DOMINATORS both inside said function,
>>> and
>>> throughout its dependencies.  I thought the point of pre-calculating
>>> dominance info (or post-dominance info) originally was because we
>>> couldn't
>>> depend on dominance info being kept up to date between executions of
>>> tree_unswitch_single_loop(), which BTW, runs recursively.
>>
>>
>> dominators are kept up-to-date within cfg manipulation routines and
>> unswitching
>> already uses them.
>>
>> So if you just try to prove 'use' is defined you don't even need
>> dominators.  But
>> that misses cases like
>>
>> int x;
>> foo ()
>> {
>>   for (;;)
>>     {
>>        if (x == 5)
>>          ...;
>>     }
>> }
>>
>> where unswitching is valid because x is always used when the loop is
>> entered.
>> Similar
>>
>> int x, a[10];
>> foo (int c)
>> {
>>   foo (x);
>>   for (i=0;i<c;++i)
>>     {
>>        if (a[c])
>>          break;
>>        if (x == 5)
>>          ...;
>>     }
>> }
>>
>> even though the condition is not always executed if the loop is entered.
>
>
> I don't think fill_always_execute_in_1 gives us what (I think) you want.
> For the attached graph, fill_always_execute_in_1 only marks the following
> basic blocks:
>
> bb 4 in loop 2
> bb 6 in loop 3
> bb 9 in loop 1
>
> Which are basically the loop headers.  Unless I'm misunderstanding
> something, this is useless for the problem at hand.
>
> For example, for loop 3, I would've expected bb5 to be marked as always
> executed if we enter loop 3 since it is its immediate predecessor. Similarly
> for loop 2.  I would expect bb3 to be marked as always being executed if we
> enter loop 2.  And bb10 always enters loop 1.

It only marks blocks inside loops, for out-of-loop blocks you can use dominance
info.  With fill_always_executed_in we want to handle

  header:
    if (x)
      ...not exit
  merge:
    if (y)

where we can unswitch on if (y) because if we enter the loop the condition is
always executed.  if the if (x) were guarding loop exit we couldn't do this.
This is basically computing dominated_by_p (CDI_POST_DOMINATORS,
loop-header, block-with-unswitch-condition).

> Also, in other testcases, for loops where the flow loops back to the
> loop-header, we don't even get the loop header marked as always executed in
> the loop.
>
> I still think my initial on-demand approach is straightforward,
> conservative, and solves the problem at hand.  Since I think I'm either
> misunderstanding here, or spending way too much time on this (I think this
> is the 3rd or 4th approach if we count Jeff's original one), I may have to
> respectfully put this back on the queue for someone else to tackle.

Your initial on-demand approach is fine to catch some of the cases but it
will not handle those for which we'd need post-dominance.

I guess we can incrementally add that.

Richard.

> Aldy

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-26 12:48                     ` Richard Biener
@ 2017-01-27 11:32                       ` Aldy Hernandez
  2017-01-30 15:05                         ` Richard Biener
  0 siblings, 1 reply; 23+ messages in thread
From: Aldy Hernandez @ 2017-01-27 11:32 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 1298 bytes --]

On 01/26/2017 07:29 AM, Richard Biener wrote:
> On Thu, Jan 26, 2017 at 1:04 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>> On 01/24/2017 07:23 AM, Richard Biener wrote:

> Your initial on-demand approach is fine to catch some of the cases but it
> will not handle those for which we'd need post-dominance.
>
> I guess we can incrementally add that.

No complaints from me.

This is my initial on-demand approach, with a few fixes you've commented 
on throughout.

As you can see, there is still an #if 0 wrt to using your suggested 
conservative handling of memory loads, which I'm not entirely convinced 
of, as it pessimizes gcc.dg/loop-unswitch-1.c.  If you feel strongly 
about it, I can enable the code again.

Also, I enhanced gcc.dg/loop-unswitch-1.c to verify that we're actually 
unswitching something.  It seems kinda silly that we have various 
unswitch tests, but we don't actually check whether we have unswitched 
anything.  This test was the only one in the *unswitch*.c set that I saw 
was actually being unswitched.  Of course, if you don't agree with my 
#if 0 above, I will remove this change to the test.

Finally, are we even guaranteed to unswitch in loop-unswitch-1.c across 
architectures?  If not, again, I can remove the one-liner.

How does this look for trunk?

Aldy


[-- Attachment #2: curr --]
[-- Type: text/plain, Size: 12324 bytes --]

gcc/

	PR tree-optimization/71691
	* bitmap.h (class auto_bitmap): New.
	* tree-ssa-defined-or-undefined.c: New file.
	* tree-ssa-defined-or-undefined.h: New file.
	* Makefile.in (tree-ssa-defined-or-undefined.o): New.
	* tree-ssa-loop-unswitch.c (tree_may_unswitch_on): Add
	defined_or_undefined parameter.  Use defined_or_undefined instead
	of ssa_undefined_value_p.
	(tree_unswitch_single_loop): Instantiate defined_or_undefined
	variable.

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 821584a..e0de276 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1507,6 +1507,7 @@ OBJS = \
 	tree-ssa-coalesce.o \
 	tree-ssa-copy.o \
 	tree-ssa-dce.o \
+	tree-ssa-defined-or-undefined.o \
 	tree-ssa-dom.o \
 	tree-ssa-dse.o \
 	tree-ssa-forwprop.o \
diff --git a/gcc/bitmap.h b/gcc/bitmap.h
index 196738b..f158b44 100644
--- a/gcc/bitmap.h
+++ b/gcc/bitmap.h
@@ -802,4 +802,25 @@ bmp_iter_and_compl (bitmap_iterator *bi, unsigned *bit_no)
        bmp_iter_and_compl (&(ITER), &(BITNUM));				\
        bmp_iter_next (&(ITER), &(BITNUM)))
 
+/* A class that ties the lifetime of a bitmap to its scope.  */
+class auto_bitmap
+{
+ public:
+  auto_bitmap () { bits = BITMAP_ALLOC (NULL); }
+  ~auto_bitmap () { BITMAP_FREE (bits); }
+  // Allow calling bitmap functions on our bitmap.
+  operator bitmap () { return bits; }
+
+ private:
+  // Prevent making a copy that references our bitmap.
+  auto_bitmap (const auto_bitmap &);
+  auto_bitmap &operator = (const auto_bitmap &);
+#if __cplusplus >= 201103L
+  auto_bitmap (auto_bitmap &&);
+  auto_bitmap &operator = (auto_bitmap &&);
+#endif
+
+  bitmap bits;
+};
+
 #endif /* GCC_BITMAP_H */
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-1.c b/gcc/testsuite/gcc.dg/loop-unswitch-1.c
index 930364c..f6fc41d 100644
--- a/gcc/testsuite/gcc.dg/loop-unswitch-1.c
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-1.c
@@ -1,6 +1,6 @@
 /* For PR rtl-optimization/27735  */
 /* { dg-do compile } */
-/* { dg-options "-O2 -funswitch-loops" } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
 
 void set_color(void);
 void xml_colorize_line(unsigned int *p, int state)
@@ -32,3 +32,5 @@ parse_tag: ;
     }
 }
 
+/* Test that we actually unswitched something.  */
+/* { dg-final { scan-tree-dump ";; Unswitching loop" "unswitch" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-5.c b/gcc/testsuite/gcc.dg/loop-unswitch-5.c
new file mode 100644
index 0000000..b41e853
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-5.c
@@ -0,0 +1,51 @@
+/* PR middle-end/71691 */
+/* { dg-do run } */
+/* { dg-options "-fno-tree-vrp -O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+/* Note: The -fno-tree-vrp above is only there to avoid VRP papering
+   over the problem.  */
+
+char b;
+short f;
+unsigned e;
+int g = 20;
+
+void
+foo ()
+{
+  int l, h;
+  for (l = 0; l <= 7; l++)
+    {
+      int j = 38;
+      if (g)
+	h = 0;
+      for (; h <= 7; h++)
+	{
+	  int i, k = b % (j % 4);
+	  g = f;
+	  for (;;)
+	    {
+	      j = 6 || b;
+	      if (e)
+		{
+		  for (; j; --j)
+		    if (k)
+		      __builtin_printf ("%d", 9);
+		  if (i)
+		    __builtin_printf ("%d", j);
+		}
+	      if (l)
+		continue;
+	      break;
+	    }
+	  i = f || b;
+	}
+    }
+}
+
+int
+main ()
+{
+  foo ();
+  return 0;
+}
diff --git a/gcc/tree-ssa-defined-or-undefined.c b/gcc/tree-ssa-defined-or-undefined.c
new file mode 100644
index 0000000..0ee91e5
--- /dev/null
+++ b/gcc/tree-ssa-defined-or-undefined.c
@@ -0,0 +1,118 @@
+/* Simple class for classifying SSA_NAMEs that are either fully
+   defined or possibly undefined.
+
+   This is meant to support conservative analysis for optimization
+   purposes, not for generating warnings.  The analysis for generating
+   warnings is deeper to avoid generation of false positives.
+
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   Contributed by Aldy Hernandez <aldyh@redhat.com>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "ssa.h"
+#include "tree-ssa.h"
+#include "cfgloop.h"
+#include "tree-ssa-defined-or-undefined.h"
+
+/* Return TRUE if an SSA_NAME maybe undefined.  */
+
+bool
+defined_or_undefined::is_maybe_undefined (const tree name)
+{
+  auto_bitmap visited_ssa;
+  auto_vec<tree> worklist;
+  worklist.safe_push (name);
+  bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
+  while (!worklist.is_empty ())
+    {
+      tree t = worklist.pop ();
+
+      /* If it's obviously undefined, avoid further computations.  */
+      if (ssa_undefined_value_p (t, true))
+	return true;
+
+      /* A PARM_DECL will not have an SSA_NAME_DEF_STMT.  Parameters
+	 get their initial value from function entry.  */
+      if (SSA_NAME_VAR (t) && TREE_CODE (SSA_NAME_VAR (t)) == PARM_DECL)
+	return false;
+
+      gimple *def = SSA_NAME_DEF_STMT (t);
+
+      /* Check that all the PHI args are fully defined.  */
+      if (gphi *phi = dyn_cast <gphi *> (def))
+	{
+	  if (virtual_operand_p (PHI_RESULT (phi)))
+	    continue;
+	  for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
+	    {
+	      tree t = gimple_phi_arg_def (phi, i);
+	      /* If an SSA has already been seen, it may be a loop,
+		 but we can continue and ignore this use.  Otherwise,
+		 add the SSA_NAME to the queue and visit it later.  */
+	      if (TREE_CODE (t) == SSA_NAME
+		  && bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (t)))
+		worklist.safe_push (t);
+	    }
+	  continue;
+	}
+
+      /* Handle calls and memory loads conservatively.  */
+      if (!is_gimple_assign (def)
+#if 0
+	  /*
+	    This will inhibit unswitching in the presence of memory loads,
+	    which causes gcc.dg/loop_unswitch-1.c to FAIL because we no longer
+	    unswitch:
+
+	    for (;;) {
+	      c = *p;
+	      ...
+	      if (c)
+	        stuff;
+	      else
+	        other_stuff;
+	    }
+	  */
+	  || (gimple_assign_single_p (def)
+	      && gimple_vuse (def)))
+#endif
+	  )
+	return true;
+
+      /* Check that any SSA names used to define NAME are also fully
+	 defined.  */
+      use_operand_p use_p;
+      ssa_op_iter iter;
+      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
+	{
+	  tree t = USE_FROM_PTR (use_p);
+	  /* If an SSA has already been seen, it may be a loop,
+	     but we can continue and ignore this use.  Otherwise,
+	     add the SSA_NAME to the queue and visit it later.  */
+	  if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (t)))
+	    worklist.safe_push (t);
+	}
+    }
+  return false;
+}
diff --git a/gcc/tree-ssa-defined-or-undefined.h b/gcc/tree-ssa-defined-or-undefined.h
new file mode 100644
index 0000000..aef0908
--- /dev/null
+++ b/gcc/tree-ssa-defined-or-undefined.h
@@ -0,0 +1,59 @@
+/* Simple class for identifying SSA_NAMEs that are either fully defined
+   or maybe undefined.
+
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   Contributed by Aldy Hernandez <aldyh@redhat.com>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_TREE_SSA_DEFINED_OR_UNDEFINED_H
+#define GCC_TREE_SSA_DEFINED_OR_UNDEFINED_H
+
+/* Simple class for identifying SSA_NAMEs that are either fully
+   defined or maybe undefined.
+
+   This is meant to support conservative analysis for optimization
+   purposes, not for generating warnings.  The analysis for generating
+   warnings is deeper to avoid generation of false positives.
+
+   Instantiation of this class triggers analysis which is not kept
+   up-to-date as changes in the IL or CFG are made.
+
+   Queries will return the conservative result (maybe undefined) for
+   SSA_NAMEs that were not encountered during the initial
+   analysis.  */
+
+class defined_or_undefined
+{
+ private:
+  // Entry block of the current loop.
+  basic_block loop_entry;
+  vec<basic_block> loop_region;
+ public:
+  defined_or_undefined (struct loop *loop)
+    { loop_entry = loop->header; }
+  /* Note: We could implement a variant of this class that does not
+     need loop info.  Until such need arises, leave this
+     unimplemented.  */
+  defined_or_undefined ();
+  // Return TRUE if SSA is maybe undefined.
+  bool is_maybe_undefined (tree ssa);
+  // Return TRUE if SSA is definitely defined.
+  bool is_defined (tree ssa) { return !is_maybe_undefined (ssa); }
+};
+
+#endif // GCC_TREE_SSA_DEFINED_OR_UNDEFINED_H
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index 92599fb..4452373 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -38,6 +38,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-iterator.h"
 #include "cfghooks.h"
 #include "tree-ssa-loop-manip.h"
+#include "tree-ssa-defined-or-undefined.h"
 
 /* This file implements the loop unswitching, i.e. transformation of loops like
 
@@ -77,7 +78,8 @@ along with GCC; see the file COPYING3.  If not see
 
 static struct loop *tree_unswitch_loop (struct loop *, basic_block, tree);
 static bool tree_unswitch_single_loop (struct loop *, int);
-static tree tree_may_unswitch_on (basic_block, struct loop *);
+static tree tree_may_unswitch_on (basic_block, struct loop *,
+				  defined_or_undefined *);
 static bool tree_unswitch_outer_loop (struct loop *);
 static edge find_loop_guard (struct loop *);
 static bool empty_bb_without_guard_p (struct loop *, basic_block);
@@ -113,7 +115,8 @@ tree_ssa_unswitch_loops (void)
    basic blocks (for what it means see comments below).  */
 
 static tree
-tree_may_unswitch_on (basic_block bb, struct loop *loop)
+tree_may_unswitch_on (basic_block bb, struct loop *loop,
+		      defined_or_undefined *defined_or_undefined)
 {
   gimple *last, *def;
   gcond *stmt;
@@ -138,7 +141,7 @@ tree_may_unswitch_on (basic_block bb, struct loop *loop)
     {
       /* Unswitching on undefined values would introduce undefined
 	 behavior that the original program might never exercise.  */
-      if (ssa_undefined_value_p (use, true))
+      if (defined_or_undefined->is_maybe_undefined (use))
 	return NULL_TREE;
       def = SSA_NAME_DEF_STMT (use);
       def_bb = gimple_bb (def);
@@ -200,6 +203,7 @@ tree_unswitch_single_loop (struct loop *loop, int num)
   gimple *stmt;
   bool changed = false;
   HOST_WIDE_INT iterations;
+  defined_or_undefined defined_or_undefined (loop);
 
   /* Perform initial tests if unswitch is eligible.  */
   if (num == 0)
@@ -243,7 +247,7 @@ tree_unswitch_single_loop (struct loop *loop, int num)
     {
       /* Find a bb to unswitch on.  */
       for (; i < loop->num_nodes; i++)
-	if ((cond = tree_may_unswitch_on (bbs[i], loop)))
+	if ((cond = tree_may_unswitch_on (bbs[i], loop, &defined_or_undefined)))
 	  break;
 
       if (i == loop->num_nodes)
@@ -363,7 +367,8 @@ tree_unswitch_single_loop (struct loop *loop, int num)
       /* Find a bb to unswitch on.  */
       for (; found < loop->num_nodes; found++)
 	if ((bbs[found]->flags & BB_REACHABLE)
-	    && (cond = tree_may_unswitch_on (bbs[found], loop)))
+	    && (cond = tree_may_unswitch_on (bbs[found], loop,
+					     &defined_or_undefined)))
 	  break;
 
       if (found == loop->num_nodes)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-27 11:32                       ` Aldy Hernandez
@ 2017-01-30 15:05                         ` Richard Biener
  2017-01-30 17:24                           ` Aldy Hernandez
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Biener @ 2017-01-30 15:05 UTC (permalink / raw)
  To: Aldy Hernandez; +Cc: gcc-patches, Jeff Law

On Fri, Jan 27, 2017 at 12:20 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
> On 01/26/2017 07:29 AM, Richard Biener wrote:
>>
>> On Thu, Jan 26, 2017 at 1:04 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>>
>>> On 01/24/2017 07:23 AM, Richard Biener wrote:
>
>
>> Your initial on-demand approach is fine to catch some of the cases but it
>> will not handle those for which we'd need post-dominance.
>>
>> I guess we can incrementally add that.
>
>
> No complaints from me.
>
> This is my initial on-demand approach, with a few fixes you've commented on
> throughout.
>
> As you can see, there is still an #if 0 wrt to using your suggested
> conservative handling of memory loads, which I'm not entirely convinced of,
> as it pessimizes gcc.dg/loop-unswitch-1.c.  If you feel strongly about it, I
> can enable the code again.

It is really required -- fortunately loop-unswitch-1.c is one of the cases where
the post-dom / always-executed bits help .  The comparison is inside the
loop header and thus always executed when the loop enters, so inserrting it
on the preheader edge is fine.

> Also, I enhanced gcc.dg/loop-unswitch-1.c to verify that we're actually
> unswitching something.  It seems kinda silly that we have various unswitch
> tests, but we don't actually check whether we have unswitched anything.

Heh.  It probably was added for an ICE...

> This test was the only one in the *unswitch*.c set that I saw was actually
> being unswitched.  Of course, if you don't agree with my #if 0 above, I will
> remove this change to the test.
>
> Finally, are we even guaranteed to unswitch in loop-unswitch-1.c across
> architectures?  If not, again, I can remove the one-liner.

I think so.

>
> How does this look for trunk?

With a unswitch-local solution I meant to not add new files but put the
defined_or_undefined class (well, or rather a single function...) into
tree-ssa-loop-unswitch.c.

@@ -138,7 +141,7 @@ tree_may_unswitch_on (basic_block bb, struct loop *loop)
     {
       /* Unswitching on undefined values would introduce undefined
         behavior that the original program might never exercise.  */
-      if (ssa_undefined_value_p (use, true))
+      if (defined_or_undefined->is_maybe_undefined (use))
        return NULL_TREE;
       def = SSA_NAME_DEF_STMT (use);
       def_bb = gimple_bb (def);

as I said, moving this (now more expensive check) after

      if (def_bb
          && flow_bb_inside_loop_p (loop, def_bb))
        return NULL_TREE;

this cheap check would be better.  It should avoid 99% of all calls I bet.

You can recover the loop-unswitch-1.c case by passing down
the using stmt and checking its BB against loop_header (the only
block that we trivially know is always executed when entering the region).
Or do that check in the caller, like

        if (bb != loop->header
           && ssa_undefined_value_p (use, true) /
defined_or_undefined->is_maybe_undefined (use))

+      gimple *def = SSA_NAME_DEF_STMT (t);
+
+      /* Check that all the PHI args are fully defined.  */
+      if (gphi *phi = dyn_cast <gphi *> (def))
+       {
+         if (virtual_operand_p (PHI_RESULT (phi)))
+           continue;

You should never run into virtual operands (you only walk SSA_OP_USEs).

You can stop walking at stmts that dominate the region header,
like with

+      gimple *def = SSA_NAME_DEF_STMT (t);
        /* Uses in stmts always executed when the region header
executes are fine.  */
        if (dominated_by_p (CDI_DOMINATORS, loop_header, gimple_bb (def)))
          continue;

and the bail out for PARM_DECLs is wrong:

+      /* A PARM_DECL will not have an SSA_NAME_DEF_STMT.  Parameters
+        get their initial value from function entry.  */
+      if (SSA_NAME_VAR (t) && TREE_CODE (SSA_NAME_VAR (t)) == PARM_DECL)
+       return false;

needs to be a continue; rather than a return false.


Otherwise looks ok and sorry for the continuing delays in reviewing this...

Thanks,
Richard.

> Aldy
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-30 15:05                         ` Richard Biener
@ 2017-01-30 17:24                           ` Aldy Hernandez
  2017-01-31  8:11                             ` Richard Biener
  0 siblings, 1 reply; 23+ messages in thread
From: Aldy Hernandez @ 2017-01-30 17:24 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 5130 bytes --]

On 01/30/2017 10:03 AM, Richard Biener wrote:
> On Fri, Jan 27, 2017 at 12:20 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>> On 01/26/2017 07:29 AM, Richard Biener wrote:
>>>
>>> On Thu, Jan 26, 2017 at 1:04 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>>>
>>>> On 01/24/2017 07:23 AM, Richard Biener wrote:
>>
>>
>>> Your initial on-demand approach is fine to catch some of the cases but it
>>> will not handle those for which we'd need post-dominance.
>>>
>>> I guess we can incrementally add that.
>>
>>
>> No complaints from me.
>>
>> This is my initial on-demand approach, with a few fixes you've commented on
>> throughout.
>>
>> As you can see, there is still an #if 0 wrt to using your suggested
>> conservative handling of memory loads, which I'm not entirely convinced of,
>> as it pessimizes gcc.dg/loop-unswitch-1.c.  If you feel strongly about it, I
>> can enable the code again.
>
> It is really required -- fortunately loop-unswitch-1.c is one of the cases where
> the post-dom / always-executed bits help .  The comparison is inside the
> loop header and thus always executed when the loop enters, so inserrting it
> on the preheader edge is fine.

Left as is then.

>
>> Also, I enhanced gcc.dg/loop-unswitch-1.c to verify that we're actually
>> unswitching something.  It seems kinda silly that we have various unswitch
>> tests, but we don't actually check whether we have unswitched anything.
>
> Heh.  It probably was added for an ICE...
>
>> This test was the only one in the *unswitch*.c set that I saw was actually
>> being unswitched.  Of course, if you don't agree with my #if 0 above, I will
>> remove this change to the test.
>>
>> Finally, are we even guaranteed to unswitch in loop-unswitch-1.c across
>> architectures?  If not, again, I can remove the one-liner.
>
> I think so.

Left as well.

>
>>
>> How does this look for trunk?
>
> With a unswitch-local solution I meant to not add new files but put the
> defined_or_undefined class (well, or rather a single function...) into
> tree-ssa-loop-unswitch.c.

Done.

>
> @@ -138,7 +141,7 @@ tree_may_unswitch_on (basic_block bb, struct loop *loop)
>      {
>        /* Unswitching on undefined values would introduce undefined
>          behavior that the original program might never exercise.  */
> -      if (ssa_undefined_value_p (use, true))
> +      if (defined_or_undefined->is_maybe_undefined (use))
>         return NULL_TREE;
>        def = SSA_NAME_DEF_STMT (use);
>        def_bb = gimple_bb (def);
>
> as I said, moving this (now more expensive check) after
>
>       if (def_bb
>           && flow_bb_inside_loop_p (loop, def_bb))
>         return NULL_TREE;
>
> this cheap check would be better.  It should avoid 99% of all calls I bet.

Done.

>
> You can recover the loop-unswitch-1.c case by passing down
> the using stmt and checking its BB against loop_header (the only
> block that we trivially know is always executed when entering the region).
> Or do that check in the caller, like
>
>         if (bb != loop->header
>            && ssa_undefined_value_p (use, true) /
> defined_or_undefined->is_maybe_undefined (use))

Done in callee.

>
> +      gimple *def = SSA_NAME_DEF_STMT (t);
> +
> +      /* Check that all the PHI args are fully defined.  */
> +      if (gphi *phi = dyn_cast <gphi *> (def))
> +       {
> +         if (virtual_operand_p (PHI_RESULT (phi)))
> +           continue;
>
> You should never run into virtual operands (you only walk SSA_OP_USEs).

Done.

>
> You can stop walking at stmts that dominate the region header,
> like with
>
> +      gimple *def = SSA_NAME_DEF_STMT (t);
>         /* Uses in stmts always executed when the region header
> executes are fine.  */
>         if (dominated_by_p (CDI_DOMINATORS, loop_header, gimple_bb (def)))
>           continue;

Hmmmm... doing this causes the PR testcase (gcc.dg/loop-unswitch-5.c in 
the attached patch to FAIL).  I haven't looked at it, but I seem to 
recall in the testcase that we could have a DEF that dominated the loop 
but was a mess of PHI's, some of whose args were undefined.

Did you perhaps want to put that dominated_by_p call after the PHI arg 
checks (which seems to work)?:

       /* Check that all the PHI args are fully defined.  */
       if (gphi *phi = dyn_cast <gphi *> (def))
...
...
...

+      /* Uses in stmts always executed when the region header executes
+        are fine.  */
+      if (dominated_by_p (CDI_DOMINATORS, loop->header, gimple_bb (def)))
+       continue;
+
        /* Handle calls and memory loads conservatively.  */
        if (!is_gimple_assign (def)
           || (gimple_assign_single_p (def)

Until this is clear, I've left this dominated_by_p call #if 0'ed out.

>
> and the bail out for PARM_DECLs is wrong:
>
> +      /* A PARM_DECL will not have an SSA_NAME_DEF_STMT.  Parameters
> +        get their initial value from function entry.  */
> +      if (SSA_NAME_VAR (t) && TREE_CODE (SSA_NAME_VAR (t)) == PARM_DECL)
> +       return false;
>
> needs to be a continue; rather than a return false.

Done.

Preliminary test show the attached patch works.  Further tests on-going.

Aldy

[-- Attachment #2: curr --]
[-- Type: text/plain, Size: 6642 bytes --]

gcc/

	PR tree-optimization/71691
	* bitmap.h (class auto_bitmap): New.
	* tree-ssa-loop-unswitch.c (tree_may_unswitch_on): Call
	is_maybe_undefined instead of ssa_undefined_value_p.

gcc/testsuite/

	PR tree-optimization/71691
	* gcc.dg/loop-unswitch-5.c: Test that we actually unswitch a loop.

diff --git a/gcc/bitmap.h b/gcc/bitmap.h
index 196738b..f158b44 100644
--- a/gcc/bitmap.h
+++ b/gcc/bitmap.h
@@ -802,4 +802,25 @@ bmp_iter_and_compl (bitmap_iterator *bi, unsigned *bit_no)
        bmp_iter_and_compl (&(ITER), &(BITNUM));				\
        bmp_iter_next (&(ITER), &(BITNUM)))
 
+/* A class that ties the lifetime of a bitmap to its scope.  */
+class auto_bitmap
+{
+ public:
+  auto_bitmap () { bits = BITMAP_ALLOC (NULL); }
+  ~auto_bitmap () { BITMAP_FREE (bits); }
+  // Allow calling bitmap functions on our bitmap.
+  operator bitmap () { return bits; }
+
+ private:
+  // Prevent making a copy that references our bitmap.
+  auto_bitmap (const auto_bitmap &);
+  auto_bitmap &operator = (const auto_bitmap &);
+#if __cplusplus >= 201103L
+  auto_bitmap (auto_bitmap &&);
+  auto_bitmap &operator = (auto_bitmap &&);
+#endif
+
+  bitmap bits;
+};
+
 #endif /* GCC_BITMAP_H */
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-1.c b/gcc/testsuite/gcc.dg/loop-unswitch-1.c
index 930364c..f6fc41d 100644
--- a/gcc/testsuite/gcc.dg/loop-unswitch-1.c
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-1.c
@@ -1,6 +1,6 @@
 /* For PR rtl-optimization/27735  */
 /* { dg-do compile } */
-/* { dg-options "-O2 -funswitch-loops" } */
+/* { dg-options "-O2 -funswitch-loops -fdump-tree-unswitch-details" } */
 
 void set_color(void);
 void xml_colorize_line(unsigned int *p, int state)
@@ -32,3 +32,5 @@ parse_tag: ;
     }
 }
 
+/* Test that we actually unswitched something.  */
+/* { dg-final { scan-tree-dump ";; Unswitching loop" "unswitch" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-unswitch-5.c b/gcc/testsuite/gcc.dg/loop-unswitch-5.c
new file mode 100644
index 0000000..b41e853
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-unswitch-5.c
@@ -0,0 +1,51 @@
+/* PR middle-end/71691 */
+/* { dg-do run } */
+/* { dg-options "-fno-tree-vrp -O2 -funswitch-loops -fdump-tree-unswitch-details" } */
+
+/* Note: The -fno-tree-vrp above is only there to avoid VRP papering
+   over the problem.  */
+
+char b;
+short f;
+unsigned e;
+int g = 20;
+
+void
+foo ()
+{
+  int l, h;
+  for (l = 0; l <= 7; l++)
+    {
+      int j = 38;
+      if (g)
+	h = 0;
+      for (; h <= 7; h++)
+	{
+	  int i, k = b % (j % 4);
+	  g = f;
+	  for (;;)
+	    {
+	      j = 6 || b;
+	      if (e)
+		{
+		  for (; j; --j)
+		    if (k)
+		      __builtin_printf ("%d", 9);
+		  if (i)
+		    __builtin_printf ("%d", j);
+		}
+	      if (l)
+		continue;
+	      break;
+	    }
+	  i = f || b;
+	}
+    }
+}
+
+int
+main ()
+{
+  foo ();
+  return 0;
+}
diff --git a/gcc/tree-ssa-loop-unswitch.c b/gcc/tree-ssa-loop-unswitch.c
index 92599fb..28ae7dc 100644
--- a/gcc/tree-ssa-loop-unswitch.c
+++ b/gcc/tree-ssa-loop-unswitch.c
@@ -109,6 +109,84 @@ tree_ssa_unswitch_loops (void)
   return 0;
 }
 
+/* Return TRUE if an SSA_NAME maybe undefined and is therefore
+   unsuitable for unswitching.  STMT is the statement we are
+   considering for unswitching and LOOP is the loop it appears in.  */
+
+static bool
+is_maybe_undefined (const tree name, gimple *stmt, struct loop *loop)
+{
+  /* The loop header is the only block we can trivially determine that
+     will always be executed.  If the comparison is in the loop
+     header, we know it's OK to unswitch on it.  */
+  if (gimple_bb (stmt) == loop->header)
+    return false;
+
+  auto_bitmap visited_ssa;
+  auto_vec<tree> worklist;
+  worklist.safe_push (name);
+  bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (name));
+  while (!worklist.is_empty ())
+    {
+      tree t = worklist.pop ();
+
+      /* If it's obviously undefined, avoid further computations.  */
+      if (ssa_undefined_value_p (t, true))
+	return true;
+
+      /* A PARM_DECL will not have an SSA_NAME_DEF_STMT.  Parameters
+	 get their initial value from function entry.  */
+      if (SSA_NAME_VAR (t) && TREE_CODE (SSA_NAME_VAR (t)) == PARM_DECL)
+	continue;
+
+      gimple *def = SSA_NAME_DEF_STMT (t);
+
+#if 0
+      /* Uses in stmts always executed when the region header executes
+	 are fine.  */
+      if (dominated_by_p (CDI_DOMINATORS, loop->header, gimple_bb (def)))
+	continue;
+#endif
+
+      /* Check that all the PHI args are fully defined.  */
+      if (gphi *phi = dyn_cast <gphi *> (def))
+	{
+	  for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
+	    {
+	      tree t = gimple_phi_arg_def (phi, i);
+	      /* If an SSA has already been seen, it may be a loop,
+		 but we can continue and ignore this use.  Otherwise,
+		 add the SSA_NAME to the queue and visit it later.  */
+	      if (TREE_CODE (t) == SSA_NAME
+		  && bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (t)))
+		worklist.safe_push (t);
+	    }
+	  continue;
+	}
+
+      /* Handle calls and memory loads conservatively.  */
+      if (!is_gimple_assign (def)
+	  || (gimple_assign_single_p (def)
+	      && gimple_vuse (def)))
+	return true;
+
+      /* Check that any SSA names used to define NAME are also fully
+	 defined.  */
+      use_operand_p use_p;
+      ssa_op_iter iter;
+      FOR_EACH_SSA_USE_OPERAND (use_p, def, iter, SSA_OP_USE)
+	{
+	  tree t = USE_FROM_PTR (use_p);
+	  /* If an SSA has already been seen, it may be a loop,
+	     but we can continue and ignore this use.  Otherwise,
+	     add the SSA_NAME to the queue and visit it later.  */
+	  if (bitmap_set_bit (visited_ssa, SSA_NAME_VERSION (t)))
+	    worklist.safe_push (t);
+	}
+    }
+  return false;
+}
+
 /* Checks whether we can unswitch LOOP on condition at end of BB -- one of its
    basic blocks (for what it means see comments below).  */
 
@@ -136,15 +214,15 @@ tree_may_unswitch_on (basic_block bb, struct loop *loop)
   /* Condition must be invariant.  */
   FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
     {
-      /* Unswitching on undefined values would introduce undefined
-	 behavior that the original program might never exercise.  */
-      if (ssa_undefined_value_p (use, true))
-	return NULL_TREE;
       def = SSA_NAME_DEF_STMT (use);
       def_bb = gimple_bb (def);
       if (def_bb
 	  && flow_bb_inside_loop_p (loop, def_bb))
 	return NULL_TREE;
+      /* Unswitching on undefined values would introduce undefined
+	 behavior that the original program might never exercise.  */
+      if (is_maybe_undefined (use, stmt, loop))
+	return NULL_TREE;
     }
 
   cond = build2 (gimple_cond_code (stmt), boolean_type_node,

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2)
  2017-01-30 17:24                           ` Aldy Hernandez
@ 2017-01-31  8:11                             ` Richard Biener
  0 siblings, 0 replies; 23+ messages in thread
From: Richard Biener @ 2017-01-31  8:11 UTC (permalink / raw)
  To: Aldy Hernandez; +Cc: gcc-patches, Jeff Law

On Mon, Jan 30, 2017 at 6:19 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
> On 01/30/2017 10:03 AM, Richard Biener wrote:
>>
>> On Fri, Jan 27, 2017 at 12:20 PM, Aldy Hernandez <aldyh@redhat.com> wrote:
>>>
>>> On 01/26/2017 07:29 AM, Richard Biener wrote:
>>>>
>>>>
>>>> On Thu, Jan 26, 2017 at 1:04 PM, Aldy Hernandez <aldyh@redhat.com>
>>>> wrote:
>>>>>
>>>>>
>>>>> On 01/24/2017 07:23 AM, Richard Biener wrote:
>>>
>>>
>>>
>>>> Your initial on-demand approach is fine to catch some of the cases but
>>>> it
>>>> will not handle those for which we'd need post-dominance.
>>>>
>>>> I guess we can incrementally add that.
>>>
>>>
>>>
>>> No complaints from me.
>>>
>>> This is my initial on-demand approach, with a few fixes you've commented
>>> on
>>> throughout.
>>>
>>> As you can see, there is still an #if 0 wrt to using your suggested
>>> conservative handling of memory loads, which I'm not entirely convinced
>>> of,
>>> as it pessimizes gcc.dg/loop-unswitch-1.c.  If you feel strongly about
>>> it, I
>>> can enable the code again.
>>
>>
>> It is really required -- fortunately loop-unswitch-1.c is one of the cases
>> where
>> the post-dom / always-executed bits help .  The comparison is inside the
>> loop header and thus always executed when the loop enters, so inserrting
>> it
>> on the preheader edge is fine.
>
>
> Left as is then.
>
>>
>>> Also, I enhanced gcc.dg/loop-unswitch-1.c to verify that we're actually
>>> unswitching something.  It seems kinda silly that we have various
>>> unswitch
>>> tests, but we don't actually check whether we have unswitched anything.
>>
>>
>> Heh.  It probably was added for an ICE...
>>
>>> This test was the only one in the *unswitch*.c set that I saw was
>>> actually
>>> being unswitched.  Of course, if you don't agree with my #if 0 above, I
>>> will
>>> remove this change to the test.
>>>
>>> Finally, are we even guaranteed to unswitch in loop-unswitch-1.c across
>>> architectures?  If not, again, I can remove the one-liner.
>>
>>
>> I think so.
>
>
> Left as well.
>
>>
>>>
>>> How does this look for trunk?
>>
>>
>> With a unswitch-local solution I meant to not add new files but put the
>> defined_or_undefined class (well, or rather a single function...) into
>> tree-ssa-loop-unswitch.c.
>
>
> Done.
>
>>
>> @@ -138,7 +141,7 @@ tree_may_unswitch_on (basic_block bb, struct loop
>> *loop)
>>      {
>>        /* Unswitching on undefined values would introduce undefined
>>          behavior that the original program might never exercise.  */
>> -      if (ssa_undefined_value_p (use, true))
>> +      if (defined_or_undefined->is_maybe_undefined (use))
>>         return NULL_TREE;
>>        def = SSA_NAME_DEF_STMT (use);
>>        def_bb = gimple_bb (def);
>>
>> as I said, moving this (now more expensive check) after
>>
>>       if (def_bb
>>           && flow_bb_inside_loop_p (loop, def_bb))
>>         return NULL_TREE;
>>
>> this cheap check would be better.  It should avoid 99% of all calls I bet.
>
>
> Done.
>
>>
>> You can recover the loop-unswitch-1.c case by passing down
>> the using stmt and checking its BB against loop_header (the only
>> block that we trivially know is always executed when entering the region).
>> Or do that check in the caller, like
>>
>>         if (bb != loop->header
>>            && ssa_undefined_value_p (use, true) /
>> defined_or_undefined->is_maybe_undefined (use))
>
>
> Done in callee.
>
>>
>> +      gimple *def = SSA_NAME_DEF_STMT (t);
>> +
>> +      /* Check that all the PHI args are fully defined.  */
>> +      if (gphi *phi = dyn_cast <gphi *> (def))
>> +       {
>> +         if (virtual_operand_p (PHI_RESULT (phi)))
>> +           continue;
>>
>> You should never run into virtual operands (you only walk SSA_OP_USEs).
>
>
> Done.
>
>>
>> You can stop walking at stmts that dominate the region header,
>> like with
>>
>> +      gimple *def = SSA_NAME_DEF_STMT (t);
>>         /* Uses in stmts always executed when the region header
>> executes are fine.  */
>>         if (dominated_by_p (CDI_DOMINATORS, loop_header, gimple_bb (def)))
>>           continue;
>
>
> Hmmmm... doing this causes the PR testcase (gcc.dg/loop-unswitch-5.c in the
> attached patch to FAIL).  I haven't looked at it, but I seem to recall in
> the testcase that we could have a DEF that dominated the loop but was a mess
> of PHI's, some of whose args were undefined.
>
> Did you perhaps want to put that dominated_by_p call after the PHI arg
> checks (which seems to work)?:

Ah, yes - of course.  PHIs are not really "defs".

>       /* Check that all the PHI args are fully defined.  */
>       if (gphi *phi = dyn_cast <gphi *> (def))
> ...
> ...
> ...
>
> +      /* Uses in stmts always executed when the region header executes
> +        are fine.  */
> +      if (dominated_by_p (CDI_DOMINATORS, loop->header, gimple_bb (def)))
> +       continue;
> +
>        /* Handle calls and memory loads conservatively.  */
>        if (!is_gimple_assign (def)
>           || (gimple_assign_single_p (def)
>
> Until this is clear, I've left this dominated_by_p call #if 0'ed out.
>
>>
>> and the bail out for PARM_DECLs is wrong:
>>
>> +      /* A PARM_DECL will not have an SSA_NAME_DEF_STMT.  Parameters
>> +        get their initial value from function entry.  */
>> +      if (SSA_NAME_VAR (t) && TREE_CODE (SSA_NAME_VAR (t)) == PARM_DECL)
>> +       return false;
>>
>> needs to be a continue; rather than a return false.
>
>
> Done.
>
> Preliminary test show the attached patch works.  Further tests on-going.

Looks good now, with the #if 0 dominance check moved after the PHI handling.

Thus, ok for trunk if the fixed variant passes testing.

Thanks,
Richard.

> Aldy

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2017-01-31  8:03 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-16 15:20 [PR tree-optimization/71691] Fix unswitching in presence of maybe-undef SSA_NAMEs (take 2) Aldy Hernandez
2016-12-19 23:30 ` Jeff Law
2016-12-20 14:39 ` Richard Biener
2016-12-20 16:01   ` Jeff Law
2016-12-20 17:40     ` Richard Biener
2016-12-21 18:00       ` Jeff Law
2016-12-21 18:52   ` Aldy Hernandez
2017-01-04 11:43     ` Richard Biener
2017-01-03 17:37   ` Aldy Hernandez
2017-01-04 12:12     ` Richard Biener
2017-01-07 12:54       ` Aldy Hernandez
2017-01-09  9:30         ` Richard Biener
2017-01-13 18:48           ` Aldy Hernandez
2017-01-18 15:17             ` Richard Biener
2017-01-23 17:28               ` Aldy Hernandez
2017-01-24 12:25                 ` Richard Biener
2017-01-26 12:04                   ` Aldy Hernandez
2017-01-26 12:48                     ` Richard Biener
2017-01-27 11:32                       ` Aldy Hernandez
2017-01-30 15:05                         ` Richard Biener
2017-01-30 17:24                           ` Aldy Hernandez
2017-01-31  8:11                             ` Richard Biener
2017-01-05  4:13 ` Trevor Saunders

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).