public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144]
@ 2023-12-29 14:44 Tamar Christina
  2024-01-08 12:19 ` Richard Biener
  0 siblings, 1 reply; 9+ messages in thread
From: Tamar Christina @ 2023-12-29 14:44 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, jlaw

[-- Attachment #1: Type: text/plain, Size: 3876 bytes --]

Hi All,

Only trying to update certain dominators doesn't seem to work very well
because as the loop gets versioned, peeled, or skip_vector then we end up with
very complicated control flow.  This means that the final merge blocks for the
loop exit are not easy to find or update.

Instead of trying to pick which exits to update, this changes it to update all
the blocks reachable by the new exits.  This is because they'll contain common
blocks with e.g. the versioned loop.  It's these blocks that need an update
most of the time.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR middle-end/113144
	* tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
	Update all dominators reachable from exit.

gcc/testsuite/ChangeLog:

	PR middle-end/113144
	* gcc.dg/vect/vect-early-break_94-pr113144.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
new file mode 100644
index 0000000000000000000000000000000000000000..903fe7be6621e81db6f29441e4309fa213d027c5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+long tar_atol256_max, tar_atol256_size, tar_atosl_min;
+char tar_atol256_s;
+void __errno_location();
+
+
+inline static long tar_atol256(long min) {
+  char c;
+  int sign;
+  c = tar_atol256_s;
+  sign = c;
+  while (tar_atol256_size) {
+    if (c != sign)
+      return sign ? min : tar_atol256_max;
+    c = tar_atol256_size--;
+  }
+  if ((c & 128) != (sign & 128))
+    return sign ? min : tar_atol256_max;
+  return 0;
+}
+
+inline static long tar_atol(long min) {
+  return tar_atol256(min);
+}
+
+long tar_atosl() {
+  long n = tar_atol(-1);
+  if (tar_atosl_min) {
+    __errno_location();
+    return 0;
+  }
+  if (n > 0)
+    return 0;
+  return n;
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 1066ea17c5674e03412b3dcd8a62ddf4dd54cf31..3810983a80c8b989be9fd9a9993642069fd39b99 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1716,8 +1716,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	  /* Now link the alternative exits.  */
 	  if (multiple_exits_p)
 	    {
-	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
-				       main_loop_exit_block);
 	      for (auto gsi_from = gsi_start_phis (loop->header),
 		   gsi_to = gsi_start_phis (new_preheader);
 		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
@@ -1751,12 +1749,26 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 
       /* Finally after wiring the new epilogue we need to update its main exit
 	 to the original function exit we recorded.  Other exits are already
-	 correct.  */
+	 correct.  Because of versioning, skip vectors and others we must update
+	 the dominators of every node reachable by the new exits.  */
       if (multiple_exits_p)
 	{
 	  update_loop = new_loop;
-	  for (edge e : get_loop_exit_edges (loop))
-	    doms.safe_push (e->dest);
+	  hash_set <basic_block> visited;
+	  auto_vec <edge> workset;
+	  edge ev;
+	  edge_iterator ei;
+	  workset.safe_splice (get_loop_exit_edges (loop));
+	  while (!workset.is_empty ())
+	    {
+	      auto bb = workset.pop ()->dest;
+	      if (visited.add (bb))
+		continue;
+	      doms.safe_push (bb);
+	      FOR_EACH_EDGE (ev, ei, bb->succs)
+		workset.safe_push (ev);
+	    }
+	  visited.empty ();
 	  doms.safe_push (exit_dest);
 
 	  /* Likely a fall-through edge, so update if needed.  */




-- 

[-- Attachment #2: rb18113.patch --]
[-- Type: text/plain, Size: 2953 bytes --]

diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
new file mode 100644
index 0000000000000000000000000000000000000000..903fe7be6621e81db6f29441e4309fa213d027c5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+long tar_atol256_max, tar_atol256_size, tar_atosl_min;
+char tar_atol256_s;
+void __errno_location();
+
+
+inline static long tar_atol256(long min) {
+  char c;
+  int sign;
+  c = tar_atol256_s;
+  sign = c;
+  while (tar_atol256_size) {
+    if (c != sign)
+      return sign ? min : tar_atol256_max;
+    c = tar_atol256_size--;
+  }
+  if ((c & 128) != (sign & 128))
+    return sign ? min : tar_atol256_max;
+  return 0;
+}
+
+inline static long tar_atol(long min) {
+  return tar_atol256(min);
+}
+
+long tar_atosl() {
+  long n = tar_atol(-1);
+  if (tar_atosl_min) {
+    __errno_location();
+    return 0;
+  }
+  if (n > 0)
+    return 0;
+  return n;
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 1066ea17c5674e03412b3dcd8a62ddf4dd54cf31..3810983a80c8b989be9fd9a9993642069fd39b99 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1716,8 +1716,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	  /* Now link the alternative exits.  */
 	  if (multiple_exits_p)
 	    {
-	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
-				       main_loop_exit_block);
 	      for (auto gsi_from = gsi_start_phis (loop->header),
 		   gsi_to = gsi_start_phis (new_preheader);
 		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
@@ -1751,12 +1749,26 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 
       /* Finally after wiring the new epilogue we need to update its main exit
 	 to the original function exit we recorded.  Other exits are already
-	 correct.  */
+	 correct.  Because of versioning, skip vectors and others we must update
+	 the dominators of every node reachable by the new exits.  */
       if (multiple_exits_p)
 	{
 	  update_loop = new_loop;
-	  for (edge e : get_loop_exit_edges (loop))
-	    doms.safe_push (e->dest);
+	  hash_set <basic_block> visited;
+	  auto_vec <edge> workset;
+	  edge ev;
+	  edge_iterator ei;
+	  workset.safe_splice (get_loop_exit_edges (loop));
+	  while (!workset.is_empty ())
+	    {
+	      auto bb = workset.pop ()->dest;
+	      if (visited.add (bb))
+		continue;
+	      doms.safe_push (bb);
+	      FOR_EACH_EDGE (ev, ei, bb->succs)
+		workset.safe_push (ev);
+	    }
+	  visited.empty ();
 	  doms.safe_push (exit_dest);
 
 	  /* Likely a fall-through edge, so update if needed.  */




^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144]
  2023-12-29 14:44 [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144] Tamar Christina
@ 2024-01-08 12:19 ` Richard Biener
  2024-01-09 11:47   ` Tamar Christina
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Biener @ 2024-01-08 12:19 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, jlaw

On Fri, 29 Dec 2023, Tamar Christina wrote:

> Hi All,
> 
> Only trying to update certain dominators doesn't seem to work very well
> because as the loop gets versioned, peeled, or skip_vector then we end up with
> very complicated control flow.  This means that the final merge blocks for the
> loop exit are not easy to find or update.
> 
> Instead of trying to pick which exits to update, this changes it to update all
> the blocks reachable by the new exits.  This is because they'll contain common
> blocks with e.g. the versioned loop.  It's these blocks that need an update
> most of the time.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?

This makes it quadratic in the number of vectorized early exit loops
in a function.  The vectorizer CFG manipulation operates in a local
enough bubble that programmatic updating of dominators should be
possible (after all we manage to produce correct SSA form!), the
proposed change gets us too far off to a point where re-computating
dominance info is likely cheaper (but no, we shouldn't do this either).

Can you instead give manual updating a try again?  I think
versioning should produce up-to-date dominator info, it's only
when you redirect branches during peeling that you'd need
adjustments - but IIRC we're never introducing new merges?

IIRC we can't wipe dominators during transform since we query them
during code generation.  We possibly could code generate all
CFG manipulations of all vectorized loops, recompute all dominators
and then do code generation of all vectorized loops.

But then we're doing a loop transform and the exits will ultimatively
end up in the same place, so the CFG and dominator update is bound to
where the original exits went to.

Richard

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	PR middle-end/113144
> 	* tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
> 	Update all dominators reachable from exit.
> 
> gcc/testsuite/ChangeLog:
> 
> 	PR middle-end/113144
> 	* gcc.dg/vect/vect-early-break_94-pr113144.c: New test.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..903fe7be6621e81db6f29441e4309fa213d027c5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> @@ -0,0 +1,41 @@
> +/* { dg-do compile } */
> +/* { dg-add-options vect_early_break } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +
> +long tar_atol256_max, tar_atol256_size, tar_atosl_min;
> +char tar_atol256_s;
> +void __errno_location();
> +
> +
> +inline static long tar_atol256(long min) {
> +  char c;
> +  int sign;
> +  c = tar_atol256_s;
> +  sign = c;
> +  while (tar_atol256_size) {
> +    if (c != sign)
> +      return sign ? min : tar_atol256_max;
> +    c = tar_atol256_size--;
> +  }
> +  if ((c & 128) != (sign & 128))
> +    return sign ? min : tar_atol256_max;
> +  return 0;
> +}
> +
> +inline static long tar_atol(long min) {
> +  return tar_atol256(min);
> +}
> +
> +long tar_atosl() {
> +  long n = tar_atol(-1);
> +  if (tar_atosl_min) {
> +    __errno_location();
> +    return 0;
> +  }
> +  if (n > 0)
> +    return 0;
> +  return n;
> +}
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 1066ea17c5674e03412b3dcd8a62ddf4dd54cf31..3810983a80c8b989be9fd9a9993642069fd39b99 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -1716,8 +1716,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  	  /* Now link the alternative exits.  */
>  	  if (multiple_exits_p)
>  	    {
> -	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
> -				       main_loop_exit_block);
>  	      for (auto gsi_from = gsi_start_phis (loop->header),
>  		   gsi_to = gsi_start_phis (new_preheader);
>  		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> @@ -1751,12 +1749,26 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  
>        /* Finally after wiring the new epilogue we need to update its main exit
>  	 to the original function exit we recorded.  Other exits are already
> -	 correct.  */
> +	 correct.  Because of versioning, skip vectors and others we must update
> +	 the dominators of every node reachable by the new exits.  */
>        if (multiple_exits_p)
>  	{
>  	  update_loop = new_loop;
> -	  for (edge e : get_loop_exit_edges (loop))
> -	    doms.safe_push (e->dest);
> +	  hash_set <basic_block> visited;
> +	  auto_vec <edge> workset;
> +	  edge ev;
> +	  edge_iterator ei;
> +	  workset.safe_splice (get_loop_exit_edges (loop));
> +	  while (!workset.is_empty ())
> +	    {
> +	      auto bb = workset.pop ()->dest;
> +	      if (visited.add (bb))
> +		continue;
> +	      doms.safe_push (bb);
> +	      FOR_EACH_EDGE (ev, ei, bb->succs)
> +		workset.safe_push (ev);
> +	    }
> +	  visited.empty ();
>  	  doms.safe_push (exit_dest);
>  
>  	  /* Likely a fall-through edge, so update if needed.  */
> 
> 
> 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144]
  2024-01-08 12:19 ` Richard Biener
@ 2024-01-09 11:47   ` Tamar Christina
  2024-01-09 12:25     ` Richard Biener
  0 siblings, 1 reply; 9+ messages in thread
From: Tamar Christina @ 2024-01-09 11:47 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, jlaw

[-- Attachment #1: Type: text/plain, Size: 5229 bytes --]

> This makes it quadratic in the number of vectorized early exit loops
> in a function.  The vectorizer CFG manipulation operates in a local
> enough bubble that programmatic updating of dominators should be
> possible (after all we manage to produce correct SSA form!), the
> proposed change gets us too far off to a point where re-computating
> dominance info is likely cheaper (but no, we shouldn't do this either).
> 
> Can you instead give manual updating a try again?  I think
> versioning should produce up-to-date dominator info, it's only
> when you redirect branches during peeling that you'd need
> adjustments - but IIRC we're never introducing new merges?
> 
> IIRC we can't wipe dominators during transform since we query them
> during code generation.  We possibly could code generate all
> CFG manipulations of all vectorized loops, recompute all dominators
> and then do code generation of all vectorized loops.
> 
> But then we're doing a loop transform and the exits will ultimatively
> end up in the same place, so the CFG and dominator update is bound to
> where the original exits went to.

Yeah that's a fair point, the issue is specifically with at_exit.  So how about:

When we peel at_exit we are moving the new loop at the exit of the previous
loop.  This means that the blocks outside the loop dat the previous loop used to
dominate are no longer being dominated by it.

The new dominators however are hard to predict since if the loop has multiple
exits and all the exits are an "early" one then we always execute the scalar
loop.  In this case the scalar loop can completely dominate the new loop.

If we later have skip_vector then there's an additional skip edge added that
might change the dominators.

The previous patch would force an update of all blocks reachable from the new
exits.  This one updates *only* blocks that we know the scalar exits dominated.

For the examples this reduces the blocks to update from 18 to 3.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues normally and with --enable-checking=release --enable-lto
--with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR tree-optimization/113144
	PR tree-optimization/113145
	* tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
	Update all BB that the original exits dominated.

gcc/testsuite/ChangeLog:

	PR tree-optimization/113144
	PR tree-optimization/113145
	* gcc.dg/vect/vect-early-break_94-pr113144.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
new file mode 100644
index 0000000000000000000000000000000000000000..903fe7be6621e81db6f29441e4309fa213d027c5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+long tar_atol256_max, tar_atol256_size, tar_atosl_min;
+char tar_atol256_s;
+void __errno_location();
+
+
+inline static long tar_atol256(long min) {
+  char c;
+  int sign;
+  c = tar_atol256_s;
+  sign = c;
+  while (tar_atol256_size) {
+    if (c != sign)
+      return sign ? min : tar_atol256_max;
+    c = tar_atol256_size--;
+  }
+  if ((c & 128) != (sign & 128))
+    return sign ? min : tar_atol256_max;
+  return 0;
+}
+
+inline static long tar_atol(long min) {
+  return tar_atol256(min);
+}
+
+long tar_atosl() {
+  long n = tar_atol(-1);
+  if (tar_atosl_min) {
+    __errno_location();
+    return 0;
+  }
+  if (n > 0)
+    return 0;
+  return n;
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 76d4979c0b3b374dcaacf6825a95a8714114a63b..9bacaa182a3919cae1cb99dfc5ae4923e1f93376 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1719,8 +1719,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	  /* Now link the alternative exits.  */
 	  if (multiple_exits_p)
 	    {
-	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
-				       main_loop_exit_block);
 	      for (auto gsi_from = gsi_start_phis (loop->header),
 		   gsi_to = gsi_start_phis (new_preheader);
 		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
@@ -1776,7 +1774,14 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	{
 	  update_loop = new_loop;
 	  for (edge e : get_loop_exit_edges (loop))
-	    doms.safe_push (e->dest);
+	    {
+	      /* Basic blocks that the old loop dominated are now dominated by
+		 the new loop and so we have to update those.  */
+	      for (auto bb : get_all_dominated_blocks (CDI_DOMINATORS, e->src))
+		if (!flow_bb_inside_loop_p (loop, bb))
+		  doms.safe_push (bb);
+	      doms.safe_push (e->dest);
+	    }
 	  doms.safe_push (exit_dest);
 
 	  /* Likely a fall-through edge, so update if needed.  */

[-- Attachment #2: rb18113 (1).patch --]
[-- Type: application/octet-stream, Size: 2524 bytes --]

diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
new file mode 100644
index 0000000000000000000000000000000000000000..903fe7be6621e81db6f29441e4309fa213d027c5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+long tar_atol256_max, tar_atol256_size, tar_atosl_min;
+char tar_atol256_s;
+void __errno_location();
+
+
+inline static long tar_atol256(long min) {
+  char c;
+  int sign;
+  c = tar_atol256_s;
+  sign = c;
+  while (tar_atol256_size) {
+    if (c != sign)
+      return sign ? min : tar_atol256_max;
+    c = tar_atol256_size--;
+  }
+  if ((c & 128) != (sign & 128))
+    return sign ? min : tar_atol256_max;
+  return 0;
+}
+
+inline static long tar_atol(long min) {
+  return tar_atol256(min);
+}
+
+long tar_atosl() {
+  long n = tar_atol(-1);
+  if (tar_atosl_min) {
+    __errno_location();
+    return 0;
+  }
+  if (n > 0)
+    return 0;
+  return n;
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 76d4979c0b3b374dcaacf6825a95a8714114a63b..9bacaa182a3919cae1cb99dfc5ae4923e1f93376 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1719,8 +1719,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	  /* Now link the alternative exits.  */
 	  if (multiple_exits_p)
 	    {
-	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
-				       main_loop_exit_block);
 	      for (auto gsi_from = gsi_start_phis (loop->header),
 		   gsi_to = gsi_start_phis (new_preheader);
 		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
@@ -1776,7 +1774,14 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
 	{
 	  update_loop = new_loop;
 	  for (edge e : get_loop_exit_edges (loop))
-	    doms.safe_push (e->dest);
+	    {
+	      /* Basic blocks that the old loop dominated are now dominated by
+		 the new loop and so we have to update those.  */
+	      for (auto bb : get_all_dominated_blocks (CDI_DOMINATORS, e->src))
+		if (!flow_bb_inside_loop_p (loop, bb))
+		  doms.safe_push (bb);
+	      doms.safe_push (e->dest);
+	    }
 	  doms.safe_push (exit_dest);
 
 	  /* Likely a fall-through edge, so update if needed.  */

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144]
  2024-01-09 11:47   ` Tamar Christina
@ 2024-01-09 12:25     ` Richard Biener
  2024-01-09 13:26       ` Tamar Christina
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Biener @ 2024-01-09 12:25 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, jlaw

On Tue, 9 Jan 2024, Tamar Christina wrote:

> > This makes it quadratic in the number of vectorized early exit loops
> > in a function.  The vectorizer CFG manipulation operates in a local
> > enough bubble that programmatic updating of dominators should be
> > possible (after all we manage to produce correct SSA form!), the
> > proposed change gets us too far off to a point where re-computating
> > dominance info is likely cheaper (but no, we shouldn't do this either).
> > 
> > Can you instead give manual updating a try again?  I think
> > versioning should produce up-to-date dominator info, it's only
> > when you redirect branches during peeling that you'd need
> > adjustments - but IIRC we're never introducing new merges?
> > 
> > IIRC we can't wipe dominators during transform since we query them
> > during code generation.  We possibly could code generate all
> > CFG manipulations of all vectorized loops, recompute all dominators
> > and then do code generation of all vectorized loops.
> > 
> > But then we're doing a loop transform and the exits will ultimatively
> > end up in the same place, so the CFG and dominator update is bound to
> > where the original exits went to.
> 
> Yeah that's a fair point, the issue is specifically with at_exit.  So how about:
> 
> When we peel at_exit we are moving the new loop at the exit of the previous
> loop.  This means that the blocks outside the loop dat the previous loop used to
> dominate are no longer being dominated by it.

Hmm, indeed.  Note this does make the dominator update O(function-size)
and when vectorizing multiple loops in a function this becomes
quadratic.  That's quite unfortunate so I wonder if we can delay the
update to the parts we do not need up-to-date dominators during
vectorization (of course it gets fragile with having only partly
correct dominators).

> The new dominators however are hard to predict since if the loop has multiple
> exits and all the exits are an "early" one then we always execute the scalar
> loop.  In this case the scalar loop can completely dominate the new loop.
> 
> If we later have skip_vector then there's an additional skip edge added that
> might change the dominators.
> 
> The previous patch would force an update of all blocks reachable from the new
> exits.  This one updates *only* blocks that we know the scalar exits dominated.
> 
> For the examples this reduces the blocks to update from 18 to 3.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues normally and with --enable-checking=release --enable-lto
> --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> 
> Ok for master?

See below.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	PR tree-optimization/113144
> 	PR tree-optimization/113145
> 	* tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
> 	Update all BB that the original exits dominated.
> 
> gcc/testsuite/ChangeLog:
> 
> 	PR tree-optimization/113144
> 	PR tree-optimization/113145
> 	* gcc.dg/vect/vect-early-break_94-pr113144.c: New test.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..903fe7be6621e81db6f29441e4309fa213d027c5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> @@ -0,0 +1,41 @@
> +/* { dg-do compile } */
> +/* { dg-add-options vect_early_break } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +
> +long tar_atol256_max, tar_atol256_size, tar_atosl_min;
> +char tar_atol256_s;
> +void __errno_location();
> +
> +
> +inline static long tar_atol256(long min) {
> +  char c;
> +  int sign;
> +  c = tar_atol256_s;
> +  sign = c;
> +  while (tar_atol256_size) {
> +    if (c != sign)
> +      return sign ? min : tar_atol256_max;
> +    c = tar_atol256_size--;
> +  }
> +  if ((c & 128) != (sign & 128))
> +    return sign ? min : tar_atol256_max;
> +  return 0;
> +}
> +
> +inline static long tar_atol(long min) {
> +  return tar_atol256(min);
> +}
> +
> +long tar_atosl() {
> +  long n = tar_atol(-1);
> +  if (tar_atosl_min) {
> +    __errno_location();
> +    return 0;
> +  }
> +  if (n > 0)
> +    return 0;
> +  return n;
> +}
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 76d4979c0b3b374dcaacf6825a95a8714114a63b..9bacaa182a3919cae1cb99dfc5ae4923e1f93376 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -1719,8 +1719,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  	  /* Now link the alternative exits.  */
>  	  if (multiple_exits_p)
>  	    {
> -	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
> -				       main_loop_exit_block);
>  	      for (auto gsi_from = gsi_start_phis (loop->header),
>  		   gsi_to = gsi_start_phis (new_preheader);
>  		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> @@ -1776,7 +1774,14 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
>  	{
>  	  update_loop = new_loop;
>  	  for (edge e : get_loop_exit_edges (loop))
> -	    doms.safe_push (e->dest);
> +	    {
> +	      /* Basic blocks that the old loop dominated are now dominated by
> +		 the new loop and so we have to update those.  */
> +	      for (auto bb : get_all_dominated_blocks (CDI_DOMINATORS, e->src))
> +		if (!flow_bb_inside_loop_p (loop, bb))
> +		  doms.safe_push (bb);
> +	      doms.safe_push (e->dest);
> +	    }

I think you'll get duplicate blocks that way.  Maybe simplify this
all by instead doing

          auto doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
          for (unsigned i = 0; i < doms.length (); ++i)
            if (flow_bb_inside_loop_p (loop, doms[i]))
              doms.unordered_remove (i);

?

OK with that change, but really we should see to avoid this
quadraticness :/  It's probably not too bad right now given we have
quite some restrictions on vectorizing loops with multiple exits,
but I suggest you try an artificial testcase with the "same"
loop repeated N times to see whether dominance compute creeps up
in the profile.

Richard.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144]
  2024-01-09 12:25     ` Richard Biener
@ 2024-01-09 13:26       ` Tamar Christina
  2024-01-09 13:34         ` Richard Biener
  0 siblings, 1 reply; 9+ messages in thread
From: Tamar Christina @ 2024-01-09 13:26 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, jlaw



> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Tuesday, January 9, 2024 12:26 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH]middle-end: Fix dominators updates when peeling with
> multiple exits [PR113144]
> 
> On Tue, 9 Jan 2024, Tamar Christina wrote:
> 
> > > This makes it quadratic in the number of vectorized early exit loops
> > > in a function.  The vectorizer CFG manipulation operates in a local
> > > enough bubble that programmatic updating of dominators should be
> > > possible (after all we manage to produce correct SSA form!), the
> > > proposed change gets us too far off to a point where re-computating
> > > dominance info is likely cheaper (but no, we shouldn't do this either).
> > >
> > > Can you instead give manual updating a try again?  I think
> > > versioning should produce up-to-date dominator info, it's only
> > > when you redirect branches during peeling that you'd need
> > > adjustments - but IIRC we're never introducing new merges?
> > >
> > > IIRC we can't wipe dominators during transform since we query them
> > > during code generation.  We possibly could code generate all
> > > CFG manipulations of all vectorized loops, recompute all dominators
> > > and then do code generation of all vectorized loops.
> > >
> > > But then we're doing a loop transform and the exits will ultimatively
> > > end up in the same place, so the CFG and dominator update is bound to
> > > where the original exits went to.
> >
> > Yeah that's a fair point, the issue is specifically with at_exit.  So how about:
> >
> > When we peel at_exit we are moving the new loop at the exit of the previous
> > loop.  This means that the blocks outside the loop dat the previous loop used to
> > dominate are no longer being dominated by it.
> 
> Hmm, indeed.  Note this does make the dominator update O(function-size)
> and when vectorizing multiple loops in a function this becomes
> quadratic.  That's quite unfortunate so I wonder if we can delay the
> update to the parts we do not need up-to-date dominators during
> vectorization (of course it gets fragile with having only partly
> correct dominators).

Fair, I created https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113290 and will
tackle it when I add SLP support in GCC 15.

I think the problem is, and the reason we do early dominator correction and
validation is because the same function is used by loop distribution.

But you're right that during vectorization we perform dominators update twice
now.

So Maybe we should have a parameter to indicate whether dominators should
be updated?

Thanks,
Tamar

> 
> > The new dominators however are hard to predict since if the loop has multiple
> > exits and all the exits are an "early" one then we always execute the scalar
> > loop.  In this case the scalar loop can completely dominate the new loop.
> >
> > If we later have skip_vector then there's an additional skip edge added that
> > might change the dominators.
> >
> > The previous patch would force an update of all blocks reachable from the new
> > exits.  This one updates *only* blocks that we know the scalar exits dominated.
> >
> > For the examples this reduces the blocks to update from 18 to 3.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues normally and with --enable-checking=release --enable-lto
> > --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> >
> > Ok for master?
> 
> See below.
> 
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	PR tree-optimization/113144
> > 	PR tree-optimization/113145
> > 	* tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
> > 	Update all BB that the original exits dominated.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	PR tree-optimization/113144
> > 	PR tree-optimization/113145
> > 	* gcc.dg/vect/vect-early-break_94-pr113144.c: New test.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > new file mode 100644
> > index
> 0000000000000000000000000000000000000000..903fe7be6621e81db6f294
> 41e4309fa213d027c5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > @@ -0,0 +1,41 @@
> > +/* { dg-do compile } */
> > +/* { dg-add-options vect_early_break } */
> > +/* { dg-require-effective-target vect_early_break } */
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +
> > +long tar_atol256_max, tar_atol256_size, tar_atosl_min;
> > +char tar_atol256_s;
> > +void __errno_location();
> > +
> > +
> > +inline static long tar_atol256(long min) {
> > +  char c;
> > +  int sign;
> > +  c = tar_atol256_s;
> > +  sign = c;
> > +  while (tar_atol256_size) {
> > +    if (c != sign)
> > +      return sign ? min : tar_atol256_max;
> > +    c = tar_atol256_size--;
> > +  }
> > +  if ((c & 128) != (sign & 128))
> > +    return sign ? min : tar_atol256_max;
> > +  return 0;
> > +}
> > +
> > +inline static long tar_atol(long min) {
> > +  return tar_atol256(min);
> > +}
> > +
> > +long tar_atosl() {
> > +  long n = tar_atol(-1);
> > +  if (tar_atosl_min) {
> > +    __errno_location();
> > +    return 0;
> > +  }
> > +  if (n > 0)
> > +    return 0;
> > +  return n;
> > +}
> > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > index
> 76d4979c0b3b374dcaacf6825a95a8714114a63b..9bacaa182a3919cae1cb99dfc
> 5ae4923e1f93376 100644
> > --- a/gcc/tree-vect-loop-manip.cc
> > +++ b/gcc/tree-vect-loop-manip.cc
> > @@ -1719,8 +1719,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> *loop, edge loop_exit,
> >  	  /* Now link the alternative exits.  */
> >  	  if (multiple_exits_p)
> >  	    {
> > -	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
> > -				       main_loop_exit_block);
> >  	      for (auto gsi_from = gsi_start_phis (loop->header),
> >  		   gsi_to = gsi_start_phis (new_preheader);
> >  		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> > @@ -1776,7 +1774,14 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> *loop, edge loop_exit,
> >  	{
> >  	  update_loop = new_loop;
> >  	  for (edge e : get_loop_exit_edges (loop))
> > -	    doms.safe_push (e->dest);
> > +	    {
> > +	      /* Basic blocks that the old loop dominated are now dominated by
> > +		 the new loop and so we have to update those.  */
> > +	      for (auto bb : get_all_dominated_blocks (CDI_DOMINATORS, e->src))
> > +		if (!flow_bb_inside_loop_p (loop, bb))
> > +		  doms.safe_push (bb);
> > +	      doms.safe_push (e->dest);
> > +	    }
> 
> I think you'll get duplicate blocks that way.  Maybe simplify this
> all by instead doing
> 
>           auto doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
>           for (unsigned i = 0; i < doms.length (); ++i)
>             if (flow_bb_inside_loop_p (loop, doms[i]))
>               doms.unordered_remove (i);
> 
> ?
> 
> OK with that change, but really we should see to avoid this
> quadraticness :/  It's probably not too bad right now given we have
> quite some restrictions on vectorizing loops with multiple exits,
> but I suggest you try an artificial testcase with the "same"
> loop repeated N times to see whether dominance compute creeps up
> in the profile.
> 
> Richard.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144]
  2024-01-09 13:26       ` Tamar Christina
@ 2024-01-09 13:34         ` Richard Biener
  2024-01-09 13:50           ` Richard Biener
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Biener @ 2024-01-09 13:34 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, jlaw

On Tue, 9 Jan 2024, Tamar Christina wrote:

> 
> 
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Tuesday, January 9, 2024 12:26 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH]middle-end: Fix dominators updates when peeling with
> > multiple exits [PR113144]
> > 
> > On Tue, 9 Jan 2024, Tamar Christina wrote:
> > 
> > > > This makes it quadratic in the number of vectorized early exit loops
> > > > in a function.  The vectorizer CFG manipulation operates in a local
> > > > enough bubble that programmatic updating of dominators should be
> > > > possible (after all we manage to produce correct SSA form!), the
> > > > proposed change gets us too far off to a point where re-computating
> > > > dominance info is likely cheaper (but no, we shouldn't do this either).
> > > >
> > > > Can you instead give manual updating a try again?  I think
> > > > versioning should produce up-to-date dominator info, it's only
> > > > when you redirect branches during peeling that you'd need
> > > > adjustments - but IIRC we're never introducing new merges?
> > > >
> > > > IIRC we can't wipe dominators during transform since we query them
> > > > during code generation.  We possibly could code generate all
> > > > CFG manipulations of all vectorized loops, recompute all dominators
> > > > and then do code generation of all vectorized loops.
> > > >
> > > > But then we're doing a loop transform and the exits will ultimatively
> > > > end up in the same place, so the CFG and dominator update is bound to
> > > > where the original exits went to.
> > >
> > > Yeah that's a fair point, the issue is specifically with at_exit.  So how about:
> > >
> > > When we peel at_exit we are moving the new loop at the exit of the previous
> > > loop.  This means that the blocks outside the loop dat the previous loop used to
> > > dominate are no longer being dominated by it.
> > 
> > Hmm, indeed.  Note this does make the dominator update O(function-size)
> > and when vectorizing multiple loops in a function this becomes
> > quadratic.  That's quite unfortunate so I wonder if we can delay the
> > update to the parts we do not need up-to-date dominators during
> > vectorization (of course it gets fragile with having only partly
> > correct dominators).
> 
> Fair, I created https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113290 and will
> tackle it when I add SLP support in GCC 15.
> 
> I think the problem is, and the reason we do early dominator correction and
> validation is because the same function is used by loop distribution.
> 
> But you're right that during vectorization we perform dominators update twice
> now.

We're performing it at least once per multi-exit loop that is vectorized,
covering all downstream blocks.

> So Maybe we should have a parameter to indicate whether dominators should
> be updated?

I think we should possibly try making loop distribution use another
mechanism for its copying ... there's duplicate_loop_body_to_header_edge
that's also used by loop_version, the core parts doing the new
loop creation could be split out and the detail how the final CFG
is set up be retained in two workers.

Richard.

> Thanks,
> Tamar
> 
> > 
> > > The new dominators however are hard to predict since if the loop has multiple
> > > exits and all the exits are an "early" one then we always execute the scalar
> > > loop.  In this case the scalar loop can completely dominate the new loop.
> > >
> > > If we later have skip_vector then there's an additional skip edge added that
> > > might change the dominators.
> > >
> > > The previous patch would force an update of all blocks reachable from the new
> > > exits.  This one updates *only* blocks that we know the scalar exits dominated.
> > >
> > > For the examples this reduces the blocks to update from 18 to 3.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues normally and with --enable-checking=release --enable-lto
> > > --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> > >
> > > Ok for master?
> > 
> > See below.
> > 
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	PR tree-optimization/113144
> > > 	PR tree-optimization/113145
> > > 	* tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
> > > 	Update all BB that the original exits dominated.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > 	PR tree-optimization/113144
> > > 	PR tree-optimization/113145
> > > 	* gcc.dg/vect/vect-early-break_94-pr113144.c: New test.
> > >
> > > --- inline copy of patch ---
> > >
> > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > new file mode 100644
> > > index
> > 0000000000000000000000000000000000000000..903fe7be6621e81db6f294
> > 41e4309fa213d027c5
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > @@ -0,0 +1,41 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-add-options vect_early_break } */
> > > +/* { dg-require-effective-target vect_early_break } */
> > > +/* { dg-require-effective-target vect_int } */
> > > +
> > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > > +
> > > +long tar_atol256_max, tar_atol256_size, tar_atosl_min;
> > > +char tar_atol256_s;
> > > +void __errno_location();
> > > +
> > > +
> > > +inline static long tar_atol256(long min) {
> > > +  char c;
> > > +  int sign;
> > > +  c = tar_atol256_s;
> > > +  sign = c;
> > > +  while (tar_atol256_size) {
> > > +    if (c != sign)
> > > +      return sign ? min : tar_atol256_max;
> > > +    c = tar_atol256_size--;
> > > +  }
> > > +  if ((c & 128) != (sign & 128))
> > > +    return sign ? min : tar_atol256_max;
> > > +  return 0;
> > > +}
> > > +
> > > +inline static long tar_atol(long min) {
> > > +  return tar_atol256(min);
> > > +}
> > > +
> > > +long tar_atosl() {
> > > +  long n = tar_atol(-1);
> > > +  if (tar_atosl_min) {
> > > +    __errno_location();
> > > +    return 0;
> > > +  }
> > > +  if (n > 0)
> > > +    return 0;
> > > +  return n;
> > > +}
> > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > index
> > 76d4979c0b3b374dcaacf6825a95a8714114a63b..9bacaa182a3919cae1cb99dfc
> > 5ae4923e1f93376 100644
> > > --- a/gcc/tree-vect-loop-manip.cc
> > > +++ b/gcc/tree-vect-loop-manip.cc
> > > @@ -1719,8 +1719,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> > *loop, edge loop_exit,
> > >  	  /* Now link the alternative exits.  */
> > >  	  if (multiple_exits_p)
> > >  	    {
> > > -	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
> > > -				       main_loop_exit_block);
> > >  	      for (auto gsi_from = gsi_start_phis (loop->header),
> > >  		   gsi_to = gsi_start_phis (new_preheader);
> > >  		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> > > @@ -1776,7 +1774,14 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> > *loop, edge loop_exit,
> > >  	{
> > >  	  update_loop = new_loop;
> > >  	  for (edge e : get_loop_exit_edges (loop))
> > > -	    doms.safe_push (e->dest);
> > > +	    {
> > > +	      /* Basic blocks that the old loop dominated are now dominated by
> > > +		 the new loop and so we have to update those.  */
> > > +	      for (auto bb : get_all_dominated_blocks (CDI_DOMINATORS, e->src))
> > > +		if (!flow_bb_inside_loop_p (loop, bb))
> > > +		  doms.safe_push (bb);
> > > +	      doms.safe_push (e->dest);
> > > +	    }
> > 
> > I think you'll get duplicate blocks that way.  Maybe simplify this
> > all by instead doing
> > 
> >           auto doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
> >           for (unsigned i = 0; i < doms.length (); ++i)
> >             if (flow_bb_inside_loop_p (loop, doms[i]))
> >               doms.unordered_remove (i);
> > 
> > ?
> > 
> > OK with that change, but really we should see to avoid this
> > quadraticness :/  It's probably not too bad right now given we have
> > quite some restrictions on vectorizing loops with multiple exits,
> > but I suggest you try an artificial testcase with the "same"
> > loop repeated N times to see whether dominance compute creeps up
> > in the profile.
> > 
> > Richard.
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144]
  2024-01-09 13:34         ` Richard Biener
@ 2024-01-09 13:50           ` Richard Biener
  2024-01-09 13:58             ` Tamar Christina
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Biener @ 2024-01-09 13:50 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, jlaw

On Tue, 9 Jan 2024, Richard Biener wrote:

> On Tue, 9 Jan 2024, Tamar Christina wrote:
> 
> > 
> > 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Tuesday, January 9, 2024 12:26 PM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > > Subject: RE: [PATCH]middle-end: Fix dominators updates when peeling with
> > > multiple exits [PR113144]
> > > 
> > > On Tue, 9 Jan 2024, Tamar Christina wrote:
> > > 
> > > > > This makes it quadratic in the number of vectorized early exit loops
> > > > > in a function.  The vectorizer CFG manipulation operates in a local
> > > > > enough bubble that programmatic updating of dominators should be
> > > > > possible (after all we manage to produce correct SSA form!), the
> > > > > proposed change gets us too far off to a point where re-computating
> > > > > dominance info is likely cheaper (but no, we shouldn't do this either).
> > > > >
> > > > > Can you instead give manual updating a try again?  I think
> > > > > versioning should produce up-to-date dominator info, it's only
> > > > > when you redirect branches during peeling that you'd need
> > > > > adjustments - but IIRC we're never introducing new merges?
> > > > >
> > > > > IIRC we can't wipe dominators during transform since we query them
> > > > > during code generation.  We possibly could code generate all
> > > > > CFG manipulations of all vectorized loops, recompute all dominators
> > > > > and then do code generation of all vectorized loops.
> > > > >
> > > > > But then we're doing a loop transform and the exits will ultimatively
> > > > > end up in the same place, so the CFG and dominator update is bound to
> > > > > where the original exits went to.
> > > >
> > > > Yeah that's a fair point, the issue is specifically with at_exit.  So how about:
> > > >
> > > > When we peel at_exit we are moving the new loop at the exit of the previous
> > > > loop.  This means that the blocks outside the loop dat the previous loop used to
> > > > dominate are no longer being dominated by it.
> > > 
> > > Hmm, indeed.  Note this does make the dominator update O(function-size)
> > > and when vectorizing multiple loops in a function this becomes
> > > quadratic.  That's quite unfortunate so I wonder if we can delay the
> > > update to the parts we do not need up-to-date dominators during
> > > vectorization (of course it gets fragile with having only partly
> > > correct dominators).
> > 
> > Fair, I created https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113290 and will
> > tackle it when I add SLP support in GCC 15.
> > 
> > I think the problem is, and the reason we do early dominator correction and
> > validation is because the same function is used by loop distribution.
> > 
> > But you're right that during vectorization we perform dominators update twice
> > now.
> 
> We're performing it at least once per multi-exit loop that is vectorized,
> covering all downstream blocks.

That is, consider sth like

int a[77];

int bar ();
void foo ()
{
  int val;
#define LOOP \
  val = bar (); \
  for (int i = 0; i < 77; ++i) \
    { \
      if (a[i] == val) \
        break; \
      a[i]++; \
    }
#define LOOP10 LOOP LOOP LOOP LOOP LOOP LOOP LOOP LOOP LOOP LOOP
#define LOOP100 LOOP10 LOOP10 LOOP10 LOOP10 LOOP10 LOOP10 LOOP10 LOOP10 
LOOP10 LOOP10
#define LOOP1000 LOOP100 LOOP100 LOOP100 LOOP100 LOOP100 LOOP100 LOOP100 
LOOP100 LOOP100 LOOP100
  LOOP1000
}

where on x86_64 with -O3 -msse4.1 -fno-gcse -fno-gcse-after-reload we're
currently "fine" (calling iterate_fix_dominators 2000 times).  But
with geting all dominated blocks you should get every block to exit
"fixed" and maybe get dominance compute to show up in the profile.

Richard.

 
> > So Maybe we should have a parameter to indicate whether dominators should
> > be updated?
> 
> I think we should possibly try making loop distribution use another
> mechanism for its copying ... there's duplicate_loop_body_to_header_edge
> that's also used by loop_version, the core parts doing the new
> loop creation could be split out and the detail how the final CFG
> is set up be retained in two workers.
>
> Richard.
> 
> > Thanks,
> > Tamar
> > 
> > > 
> > > > The new dominators however are hard to predict since if the loop has multiple
> > > > exits and all the exits are an "early" one then we always execute the scalar
> > > > loop.  In this case the scalar loop can completely dominate the new loop.
> > > >
> > > > If we later have skip_vector then there's an additional skip edge added that
> > > > might change the dominators.
> > > >
> > > > The previous patch would force an update of all blocks reachable from the new
> > > > exits.  This one updates *only* blocks that we know the scalar exits dominated.
> > > >
> > > > For the examples this reduces the blocks to update from 18 to 3.
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > > and no issues normally and with --enable-checking=release --enable-lto
> > > > --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> > > >
> > > > Ok for master?
> > > 
> > > See below.
> > > 
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > 	PR tree-optimization/113144
> > > > 	PR tree-optimization/113145
> > > > 	* tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
> > > > 	Update all BB that the original exits dominated.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > 	PR tree-optimization/113144
> > > > 	PR tree-optimization/113145
> > > > 	* gcc.dg/vect/vect-early-break_94-pr113144.c: New test.
> > > >
> > > > --- inline copy of patch ---
> > > >
> > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > > new file mode 100644
> > > > index
> > > 0000000000000000000000000000000000000000..903fe7be6621e81db6f294
> > > 41e4309fa213d027c5
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > > @@ -0,0 +1,41 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-add-options vect_early_break } */
> > > > +/* { dg-require-effective-target vect_early_break } */
> > > > +/* { dg-require-effective-target vect_int } */
> > > > +
> > > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > > > +
> > > > +long tar_atol256_max, tar_atol256_size, tar_atosl_min;
> > > > +char tar_atol256_s;
> > > > +void __errno_location();
> > > > +
> > > > +
> > > > +inline static long tar_atol256(long min) {
> > > > +  char c;
> > > > +  int sign;
> > > > +  c = tar_atol256_s;
> > > > +  sign = c;
> > > > +  while (tar_atol256_size) {
> > > > +    if (c != sign)
> > > > +      return sign ? min : tar_atol256_max;
> > > > +    c = tar_atol256_size--;
> > > > +  }
> > > > +  if ((c & 128) != (sign & 128))
> > > > +    return sign ? min : tar_atol256_max;
> > > > +  return 0;
> > > > +}
> > > > +
> > > > +inline static long tar_atol(long min) {
> > > > +  return tar_atol256(min);
> > > > +}
> > > > +
> > > > +long tar_atosl() {
> > > > +  long n = tar_atol(-1);
> > > > +  if (tar_atosl_min) {
> > > > +    __errno_location();
> > > > +    return 0;
> > > > +  }
> > > > +  if (n > 0)
> > > > +    return 0;
> > > > +  return n;
> > > > +}
> > > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > > index
> > > 76d4979c0b3b374dcaacf6825a95a8714114a63b..9bacaa182a3919cae1cb99dfc
> > > 5ae4923e1f93376 100644
> > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > @@ -1719,8 +1719,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> > > *loop, edge loop_exit,
> > > >  	  /* Now link the alternative exits.  */
> > > >  	  if (multiple_exits_p)
> > > >  	    {
> > > > -	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
> > > > -				       main_loop_exit_block);
> > > >  	      for (auto gsi_from = gsi_start_phis (loop->header),
> > > >  		   gsi_to = gsi_start_phis (new_preheader);
> > > >  		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> > > > @@ -1776,7 +1774,14 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop
> > > *loop, edge loop_exit,
> > > >  	{
> > > >  	  update_loop = new_loop;
> > > >  	  for (edge e : get_loop_exit_edges (loop))
> > > > -	    doms.safe_push (e->dest);
> > > > +	    {
> > > > +	      /* Basic blocks that the old loop dominated are now dominated by
> > > > +		 the new loop and so we have to update those.  */
> > > > +	      for (auto bb : get_all_dominated_blocks (CDI_DOMINATORS, e->src))
> > > > +		if (!flow_bb_inside_loop_p (loop, bb))
> > > > +		  doms.safe_push (bb);
> > > > +	      doms.safe_push (e->dest);
> > > > +	    }
> > > 
> > > I think you'll get duplicate blocks that way.  Maybe simplify this
> > > all by instead doing
> > > 
> > >           auto doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
> > >           for (unsigned i = 0; i < doms.length (); ++i)
> > >             if (flow_bb_inside_loop_p (loop, doms[i]))
> > >               doms.unordered_remove (i);
> > > 
> > > ?
> > > 
> > > OK with that change, but really we should see to avoid this
> > > quadraticness :/  It's probably not too bad right now given we have
> > > quite some restrictions on vectorizing loops with multiple exits,
> > > but I suggest you try an artificial testcase with the "same"
> > > loop repeated N times to see whether dominance compute creeps up
> > > in the profile.
> > > 
> > > Richard.
> > 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144]
  2024-01-09 13:50           ` Richard Biener
@ 2024-01-09 13:58             ` Tamar Christina
  2024-01-09 14:14               ` Richard Biener
  0 siblings, 1 reply; 9+ messages in thread
From: Tamar Christina @ 2024-01-09 13:58 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, jlaw

> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Tuesday, January 9, 2024 1:51 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH]middle-end: Fix dominators updates when peeling with
> multiple exits [PR113144]
> 
> On Tue, 9 Jan 2024, Richard Biener wrote:
> 
> > On Tue, 9 Jan 2024, Tamar Christina wrote:
> >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Richard Biener <rguenther@suse.de>
> > > > Sent: Tuesday, January 9, 2024 12:26 PM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > > > Subject: RE: [PATCH]middle-end: Fix dominators updates when peeling with
> > > > multiple exits [PR113144]
> > > >
> > > > On Tue, 9 Jan 2024, Tamar Christina wrote:
> > > >
> > > > > > This makes it quadratic in the number of vectorized early exit loops
> > > > > > in a function.  The vectorizer CFG manipulation operates in a local
> > > > > > enough bubble that programmatic updating of dominators should be
> > > > > > possible (after all we manage to produce correct SSA form!), the
> > > > > > proposed change gets us too far off to a point where re-computating
> > > > > > dominance info is likely cheaper (but no, we shouldn't do this either).
> > > > > >
> > > > > > Can you instead give manual updating a try again?  I think
> > > > > > versioning should produce up-to-date dominator info, it's only
> > > > > > when you redirect branches during peeling that you'd need
> > > > > > adjustments - but IIRC we're never introducing new merges?
> > > > > >
> > > > > > IIRC we can't wipe dominators during transform since we query them
> > > > > > during code generation.  We possibly could code generate all
> > > > > > CFG manipulations of all vectorized loops, recompute all dominators
> > > > > > and then do code generation of all vectorized loops.
> > > > > >
> > > > > > But then we're doing a loop transform and the exits will ultimatively
> > > > > > end up in the same place, so the CFG and dominator update is bound to
> > > > > > where the original exits went to.
> > > > >
> > > > > Yeah that's a fair point, the issue is specifically with at_exit.  So how about:
> > > > >
> > > > > When we peel at_exit we are moving the new loop at the exit of the
> previous
> > > > > loop.  This means that the blocks outside the loop dat the previous loop
> used to
> > > > > dominate are no longer being dominated by it.
> > > >
> > > > Hmm, indeed.  Note this does make the dominator update O(function-size)
> > > > and when vectorizing multiple loops in a function this becomes
> > > > quadratic.  That's quite unfortunate so I wonder if we can delay the
> > > > update to the parts we do not need up-to-date dominators during
> > > > vectorization (of course it gets fragile with having only partly
> > > > correct dominators).
> > >
> > > Fair, I created https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113290 and will
> > > tackle it when I add SLP support in GCC 15.
> > >
> > > I think the problem is, and the reason we do early dominator correction and
> > > validation is because the same function is used by loop distribution.
> > >
> > > But you're right that during vectorization we perform dominators update twice
> > > now.
> >
> > We're performing it at least once per multi-exit loop that is vectorized,
> > covering all downstream blocks.
> 
> That is, consider sth like
> 
> int a[77];
> 
> int bar ();
> void foo ()
> {
>   int val;
> #define LOOP \
>   val = bar (); \
>   for (int i = 0; i < 77; ++i) \
>     { \
>       if (a[i] == val) \
>         break; \
>       a[i]++; \
>     }
> #define LOOP10 LOOP LOOP LOOP LOOP LOOP LOOP LOOP LOOP LOOP LOOP
> #define LOOP100 LOOP10 LOOP10 LOOP10 LOOP10 LOOP10 LOOP10 LOOP10
> LOOP10
> LOOP10 LOOP10
> #define LOOP1000 LOOP100 LOOP100 LOOP100 LOOP100 LOOP100 LOOP100
> LOOP100
> LOOP100 LOOP100 LOOP100
>   LOOP1000
> }
> 
> where on x86_64 with -O3 -msse4.1 -fno-gcse -fno-gcse-after-reload we're
> currently "fine" (calling iterate_fix_dominators 2000 times).  But
> with geting all dominated blocks you should get every block to exit
> "fixed" and maybe get dominance compute to show up in the profile.
> 

Yeah, that makes sense.  If we can move loop distribution to a different method,
then we can just perform dominators update only once for all loops at the end
of vectorization right?

Thanks,
Tamar

> Richard.
> 
> 
> > > So Maybe we should have a parameter to indicate whether dominators should
> > > be updated?
> >
> > I think we should possibly try making loop distribution use another
> > mechanism for its copying ... there's duplicate_loop_body_to_header_edge
> > that's also used by loop_version, the core parts doing the new
> > loop creation could be split out and the detail how the final CFG
> > is set up be retained in two workers.
> >
> > Richard.
> >
> > > Thanks,
> > > Tamar
> > >
> > > >
> > > > > The new dominators however are hard to predict since if the loop has
> multiple
> > > > > exits and all the exits are an "early" one then we always execute the scalar
> > > > > loop.  In this case the scalar loop can completely dominate the new loop.
> > > > >
> > > > > If we later have skip_vector then there's an additional skip edge added that
> > > > > might change the dominators.
> > > > >
> > > > > The previous patch would force an update of all blocks reachable from the
> new
> > > > > exits.  This one updates *only* blocks that we know the scalar exits
> dominated.
> > > > >
> > > > > For the examples this reduces the blocks to update from 18 to 3.
> > > > >
> > > > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > > > and no issues normally and with --enable-checking=release --enable-lto
> > > > > --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> > > > >
> > > > > Ok for master?
> > > >
> > > > See below.
> > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > > 	PR tree-optimization/113144
> > > > > 	PR tree-optimization/113145
> > > > > 	* tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
> > > > > 	Update all BB that the original exits dominated.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > > 	PR tree-optimization/113144
> > > > > 	PR tree-optimization/113145
> > > > > 	* gcc.dg/vect/vect-early-break_94-pr113144.c: New test.
> > > > >
> > > > > --- inline copy of patch ---
> > > > >
> > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > > b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > > > new file mode 100644
> > > > > index
> > > >
> 0000000000000000000000000000000000000000..903fe7be6621e81db6f294
> > > > 41e4309fa213d027c5
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > > > @@ -0,0 +1,41 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-add-options vect_early_break } */
> > > > > +/* { dg-require-effective-target vect_early_break } */
> > > > > +/* { dg-require-effective-target vect_int } */
> > > > > +
> > > > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > > > > +
> > > > > +long tar_atol256_max, tar_atol256_size, tar_atosl_min;
> > > > > +char tar_atol256_s;
> > > > > +void __errno_location();
> > > > > +
> > > > > +
> > > > > +inline static long tar_atol256(long min) {
> > > > > +  char c;
> > > > > +  int sign;
> > > > > +  c = tar_atol256_s;
> > > > > +  sign = c;
> > > > > +  while (tar_atol256_size) {
> > > > > +    if (c != sign)
> > > > > +      return sign ? min : tar_atol256_max;
> > > > > +    c = tar_atol256_size--;
> > > > > +  }
> > > > > +  if ((c & 128) != (sign & 128))
> > > > > +    return sign ? min : tar_atol256_max;
> > > > > +  return 0;
> > > > > +}
> > > > > +
> > > > > +inline static long tar_atol(long min) {
> > > > > +  return tar_atol256(min);
> > > > > +}
> > > > > +
> > > > > +long tar_atosl() {
> > > > > +  long n = tar_atol(-1);
> > > > > +  if (tar_atosl_min) {
> > > > > +    __errno_location();
> > > > > +    return 0;
> > > > > +  }
> > > > > +  if (n > 0)
> > > > > +    return 0;
> > > > > +  return n;
> > > > > +}
> > > > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > > > index
> > > >
> 76d4979c0b3b374dcaacf6825a95a8714114a63b..9bacaa182a3919cae1cb99dfc
> > > > 5ae4923e1f93376 100644
> > > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > > @@ -1719,8 +1719,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class
> loop
> > > > *loop, edge loop_exit,
> > > > >  	  /* Now link the alternative exits.  */
> > > > >  	  if (multiple_exits_p)
> > > > >  	    {
> > > > > -	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
> > > > > -				       main_loop_exit_block);
> > > > >  	      for (auto gsi_from = gsi_start_phis (loop->header),
> > > > >  		   gsi_to = gsi_start_phis (new_preheader);
> > > > >  		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> > > > > @@ -1776,7 +1774,14 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class
> loop
> > > > *loop, edge loop_exit,
> > > > >  	{
> > > > >  	  update_loop = new_loop;
> > > > >  	  for (edge e : get_loop_exit_edges (loop))
> > > > > -	    doms.safe_push (e->dest);
> > > > > +	    {
> > > > > +	      /* Basic blocks that the old loop dominated are now dominated
> by
> > > > > +		 the new loop and so we have to update those.  */
> > > > > +	      for (auto bb : get_all_dominated_blocks (CDI_DOMINATORS, e-
> >src))
> > > > > +		if (!flow_bb_inside_loop_p (loop, bb))
> > > > > +		  doms.safe_push (bb);
> > > > > +	      doms.safe_push (e->dest);
> > > > > +	    }
> > > >
> > > > I think you'll get duplicate blocks that way.  Maybe simplify this
> > > > all by instead doing
> > > >
> > > >           auto doms = get_all_dominated_blocks (CDI_DOMINATORS, loop-
> >header);
> > > >           for (unsigned i = 0; i < doms.length (); ++i)
> > > >             if (flow_bb_inside_loop_p (loop, doms[i]))
> > > >               doms.unordered_remove (i);
> > > >
> > > > ?
> > > >
> > > > OK with that change, but really we should see to avoid this
> > > > quadraticness :/  It's probably not too bad right now given we have
> > > > quite some restrictions on vectorizing loops with multiple exits,
> > > > but I suggest you try an artificial testcase with the "same"
> > > > loop repeated N times to see whether dominance compute creeps up
> > > > in the profile.
> > > >
> > > > Richard.
> > >
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144]
  2024-01-09 13:58             ` Tamar Christina
@ 2024-01-09 14:14               ` Richard Biener
  0 siblings, 0 replies; 9+ messages in thread
From: Richard Biener @ 2024-01-09 14:14 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, jlaw

On Tue, 9 Jan 2024, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Tuesday, January 9, 2024 1:51 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH]middle-end: Fix dominators updates when peeling with
> > multiple exits [PR113144]
> > 
> > On Tue, 9 Jan 2024, Richard Biener wrote:
> > 
> > > On Tue, 9 Jan 2024, Tamar Christina wrote:
> > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Richard Biener <rguenther@suse.de>
> > > > > Sent: Tuesday, January 9, 2024 12:26 PM
> > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > > > > Subject: RE: [PATCH]middle-end: Fix dominators updates when peeling with
> > > > > multiple exits [PR113144]
> > > > >
> > > > > On Tue, 9 Jan 2024, Tamar Christina wrote:
> > > > >
> > > > > > > This makes it quadratic in the number of vectorized early exit loops
> > > > > > > in a function.  The vectorizer CFG manipulation operates in a local
> > > > > > > enough bubble that programmatic updating of dominators should be
> > > > > > > possible (after all we manage to produce correct SSA form!), the
> > > > > > > proposed change gets us too far off to a point where re-computating
> > > > > > > dominance info is likely cheaper (but no, we shouldn't do this either).
> > > > > > >
> > > > > > > Can you instead give manual updating a try again?  I think
> > > > > > > versioning should produce up-to-date dominator info, it's only
> > > > > > > when you redirect branches during peeling that you'd need
> > > > > > > adjustments - but IIRC we're never introducing new merges?
> > > > > > >
> > > > > > > IIRC we can't wipe dominators during transform since we query them
> > > > > > > during code generation.  We possibly could code generate all
> > > > > > > CFG manipulations of all vectorized loops, recompute all dominators
> > > > > > > and then do code generation of all vectorized loops.
> > > > > > >
> > > > > > > But then we're doing a loop transform and the exits will ultimatively
> > > > > > > end up in the same place, so the CFG and dominator update is bound to
> > > > > > > where the original exits went to.
> > > > > >
> > > > > > Yeah that's a fair point, the issue is specifically with at_exit.  So how about:
> > > > > >
> > > > > > When we peel at_exit we are moving the new loop at the exit of the
> > previous
> > > > > > loop.  This means that the blocks outside the loop dat the previous loop
> > used to
> > > > > > dominate are no longer being dominated by it.
> > > > >
> > > > > Hmm, indeed.  Note this does make the dominator update O(function-size)
> > > > > and when vectorizing multiple loops in a function this becomes
> > > > > quadratic.  That's quite unfortunate so I wonder if we can delay the
> > > > > update to the parts we do not need up-to-date dominators during
> > > > > vectorization (of course it gets fragile with having only partly
> > > > > correct dominators).
> > > >
> > > > Fair, I created https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113290 and will
> > > > tackle it when I add SLP support in GCC 15.
> > > >
> > > > I think the problem is, and the reason we do early dominator correction and
> > > > validation is because the same function is used by loop distribution.
> > > >
> > > > But you're right that during vectorization we perform dominators update twice
> > > > now.
> > >
> > > We're performing it at least once per multi-exit loop that is vectorized,
> > > covering all downstream blocks.
> > 
> > That is, consider sth like
> > 
> > int a[77];
> > 
> > int bar ();
> > void foo ()
> > {
> >   int val;
> > #define LOOP \
> >   val = bar (); \
> >   for (int i = 0; i < 77; ++i) \
> >     { \
> >       if (a[i] == val) \
> >         break; \
> >       a[i]++; \
> >     }
> > #define LOOP10 LOOP LOOP LOOP LOOP LOOP LOOP LOOP LOOP LOOP LOOP
> > #define LOOP100 LOOP10 LOOP10 LOOP10 LOOP10 LOOP10 LOOP10 LOOP10
> > LOOP10
> > LOOP10 LOOP10
> > #define LOOP1000 LOOP100 LOOP100 LOOP100 LOOP100 LOOP100 LOOP100
> > LOOP100
> > LOOP100 LOOP100 LOOP100
> >   LOOP1000
> > }
> > 
> > where on x86_64 with -O3 -msse4.1 -fno-gcse -fno-gcse-after-reload we're
> > currently "fine" (calling iterate_fix_dominators 2000 times).  But
> > with geting all dominated blocks you should get every block to exit
> > "fixed" and maybe get dominance compute to show up in the profile.
> > 
> 
> Yeah, that makes sense.  If we can move loop distribution to a different method,
> then we can just perform dominators update only once for all loops at the end
> of vectorization right?

Maybe.  We'll have to see.  It would be good to have a way to mark
dominator regions as invalid (you could simply disconnect the nodes
from the tree).  What we know is that within loops dominator queries
should be OK.

I still think the invalidated region is bound, for example to blocks
dominating the sons of the scalar loop header, that is, I doubt
you really need to use get_all_dominated_blocks, get_dominated_by
should eventually suffice (or get_dominated_by_region with the
region being the scalar loop body and maybe exit blocks)?

> Thanks,
> Tamar
> 
> > Richard.
> > 
> > 
> > > > So Maybe we should have a parameter to indicate whether dominators should
> > > > be updated?
> > >
> > > I think we should possibly try making loop distribution use another
> > > mechanism for its copying ... there's duplicate_loop_body_to_header_edge
> > > that's also used by loop_version, the core parts doing the new
> > > loop creation could be split out and the detail how the final CFG
> > > is set up be retained in two workers.
> > >
> > > Richard.
> > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > >
> > > > > > The new dominators however are hard to predict since if the loop has
> > multiple
> > > > > > exits and all the exits are an "early" one then we always execute the scalar
> > > > > > loop.  In this case the scalar loop can completely dominate the new loop.
> > > > > >
> > > > > > If we later have skip_vector then there's an additional skip edge added that
> > > > > > might change the dominators.
> > > > > >
> > > > > > The previous patch would force an update of all blocks reachable from the
> > new
> > > > > > exits.  This one updates *only* blocks that we know the scalar exits
> > dominated.
> > > > > >
> > > > > > For the examples this reduces the blocks to update from 18 to 3.
> > > > > >
> > > > > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > > > > and no issues normally and with --enable-checking=release --enable-lto
> > > > > > --with-build-config=bootstrap-O3 --enable-checking=yes,rtl,extra.
> > > > > >
> > > > > > Ok for master?
> > > > >
> > > > > See below.
> > > > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > 	PR tree-optimization/113144
> > > > > > 	PR tree-optimization/113145
> > > > > > 	* tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
> > > > > > 	Update all BB that the original exits dominated.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > 	PR tree-optimization/113144
> > > > > > 	PR tree-optimization/113145
> > > > > > 	* gcc.dg/vect/vect-early-break_94-pr113144.c: New test.
> > > > > >
> > > > > > --- inline copy of patch ---
> > > > > >
> > > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > > > b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > > > > new file mode 100644
> > > > > > index
> > > > >
> > 0000000000000000000000000000000000000000..903fe7be6621e81db6f294
> > > > > 41e4309fa213d027c5
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_94-pr113144.c
> > > > > > @@ -0,0 +1,41 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-add-options vect_early_break } */
> > > > > > +/* { dg-require-effective-target vect_early_break } */
> > > > > > +/* { dg-require-effective-target vect_int } */
> > > > > > +
> > > > > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > > > > > +
> > > > > > +long tar_atol256_max, tar_atol256_size, tar_atosl_min;
> > > > > > +char tar_atol256_s;
> > > > > > +void __errno_location();
> > > > > > +
> > > > > > +
> > > > > > +inline static long tar_atol256(long min) {
> > > > > > +  char c;
> > > > > > +  int sign;
> > > > > > +  c = tar_atol256_s;
> > > > > > +  sign = c;
> > > > > > +  while (tar_atol256_size) {
> > > > > > +    if (c != sign)
> > > > > > +      return sign ? min : tar_atol256_max;
> > > > > > +    c = tar_atol256_size--;
> > > > > > +  }
> > > > > > +  if ((c & 128) != (sign & 128))
> > > > > > +    return sign ? min : tar_atol256_max;
> > > > > > +  return 0;
> > > > > > +}
> > > > > > +
> > > > > > +inline static long tar_atol(long min) {
> > > > > > +  return tar_atol256(min);
> > > > > > +}
> > > > > > +
> > > > > > +long tar_atosl() {
> > > > > > +  long n = tar_atol(-1);
> > > > > > +  if (tar_atosl_min) {
> > > > > > +    __errno_location();
> > > > > > +    return 0;
> > > > > > +  }
> > > > > > +  if (n > 0)
> > > > > > +    return 0;
> > > > > > +  return n;
> > > > > > +}
> > > > > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > > > > index
> > > > >
> > 76d4979c0b3b374dcaacf6825a95a8714114a63b..9bacaa182a3919cae1cb99dfc
> > > > > 5ae4923e1f93376 100644
> > > > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > > > @@ -1719,8 +1719,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class
> > loop
> > > > > *loop, edge loop_exit,
> > > > > >  	  /* Now link the alternative exits.  */
> > > > > >  	  if (multiple_exits_p)
> > > > > >  	    {
> > > > > > -	      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
> > > > > > -				       main_loop_exit_block);
> > > > > >  	      for (auto gsi_from = gsi_start_phis (loop->header),
> > > > > >  		   gsi_to = gsi_start_phis (new_preheader);
> > > > > >  		   !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
> > > > > > @@ -1776,7 +1774,14 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class
> > loop
> > > > > *loop, edge loop_exit,
> > > > > >  	{
> > > > > >  	  update_loop = new_loop;
> > > > > >  	  for (edge e : get_loop_exit_edges (loop))
> > > > > > -	    doms.safe_push (e->dest);
> > > > > > +	    {
> > > > > > +	      /* Basic blocks that the old loop dominated are now dominated
> > by
> > > > > > +		 the new loop and so we have to update those.  */
> > > > > > +	      for (auto bb : get_all_dominated_blocks (CDI_DOMINATORS, e-
> > >src))
> > > > > > +		if (!flow_bb_inside_loop_p (loop, bb))
> > > > > > +		  doms.safe_push (bb);
> > > > > > +	      doms.safe_push (e->dest);
> > > > > > +	    }
> > > > >
> > > > > I think you'll get duplicate blocks that way.  Maybe simplify this
> > > > > all by instead doing
> > > > >
> > > > >           auto doms = get_all_dominated_blocks (CDI_DOMINATORS, loop-
> > >header);
> > > > >           for (unsigned i = 0; i < doms.length (); ++i)
> > > > >             if (flow_bb_inside_loop_p (loop, doms[i]))
> > > > >               doms.unordered_remove (i);
> > > > >
> > > > > ?
> > > > >
> > > > > OK with that change, but really we should see to avoid this
> > > > > quadraticness :/  It's probably not too bad right now given we have
> > > > > quite some restrictions on vectorizing loops with multiple exits,
> > > > > but I suggest you try an artificial testcase with the "same"
> > > > > loop repeated N times to see whether dominance compute creeps up
> > > > > in the profile.
> > > > >
> > > > > Richard.
> > > >
> > >
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2024-01-09 14:19 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-29 14:44 [PATCH]middle-end: Fix dominators updates when peeling with multiple exits [PR113144] Tamar Christina
2024-01-08 12:19 ` Richard Biener
2024-01-09 11:47   ` Tamar Christina
2024-01-09 12:25     ` Richard Biener
2024-01-09 13:26       ` Tamar Christina
2024-01-09 13:34         ` Richard Biener
2024-01-09 13:50           ` Richard Biener
2024-01-09 13:58             ` Tamar Christina
2024-01-09 14:14               ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).