public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Optimize BB sorting in domwalk
@ 2017-07-24 11:30 Alexander Monakov
  2017-07-24 12:06 ` Ulrich Drepper
  2017-07-24 22:14 ` Jeff Law
  0 siblings, 2 replies; 6+ messages in thread
From: Alexander Monakov @ 2017-07-24 11:30 UTC (permalink / raw)
  To: gcc-patches

Profiling uses of qsort in GCC reveals that a significant portion of calls
comes from domwalk.c where child nodes in dominator tree are reordered
according to postorder numbering.  However we know that in dominator trees
the vast majority of nodes have 0, 2 or 3 children, and handling those
cases separately allows to avoid qsort overhead.

I don't have trunk figures handy, but on gcc-6 release-checking tramp3d
compilation this would eliminate about 63% of _all_ qsort calls by count,
or about 27% by time.  Call count profile looks like:

N=0	N=1	N=2	N=3	N=4	N=5	N=6 	...	Total
16111   9406    228607  91870   20154   8317    3823            406971

With this patch it would look like

N=0	N=1	N=2	N=3	N=4	N=5	N=6	...	Total
16111   9406    32200   34132   16966   8022    3702            149177

While at it, I've also added a clarifying comment to bb_postorder (it
actually gives reverse postorder), simplified casts in cmp_bb_postorder
and changed it to compute the result in a branchless fashion.

Bootstrapped and regtested on x86-64, also verified that gcc/*.o files are
unchanged except for domwalk.o and *-checksum.o.  OK for trunk?

	* domwalk.c (cmp_bb_postorder): Simplify.
	(sort_bbs_postorder): New function.  Use it...
	(dom_walker::walk): ...here to optimize common cases.
---
 gcc/domwalk.c | 53 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/gcc/domwalk.c b/gcc/domwalk.c
index a0daae6..dfb1993 100644
--- a/gcc/domwalk.c
+++ b/gcc/domwalk.c
@@ -128,19 +128,46 @@ along with GCC; see the file COPYING3.  If not see
     which is currently an abstraction over walking tree statements.  Thus
     the dominator walker is currently only useful for trees.  */
 
+/* Reverse postorder index of each basic block.  */
 static int *bb_postorder;
 
 static int
 cmp_bb_postorder (const void *a, const void *b)
 {
-  basic_block bb1 = *(basic_block *)const_cast<void *>(a);
-  basic_block bb2 = *(basic_block *)const_cast<void *>(b);
-  if (bb1->index == bb2->index)
-    return 0;
+  basic_block bb1 = *(const basic_block *)(a);
+  basic_block bb2 = *(const basic_block *)(b);
+  int n1 = bb_postorder[bb1->index], n2 = bb_postorder[bb2->index];
   /* Place higher completion number first (pop off lower number first).  */
-  if (bb_postorder[bb1->index] > bb_postorder[bb2->index])
-    return -1;
-  return 1;
+  return (n1 < n2) - (n1 > n2);
+}
+
+/* Permute array BBS of N basic blocks in postorder,
+   i.e. by descending number in BB_POSTORDER array.  */
+
+static void
+sort_bbs_postorder (basic_block *bbs, int n)
+{
+  if (__builtin_expect (n == 2, true))
+    {
+      basic_block bb0 = bbs[0], bb1 = bbs[1];
+      if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
+	bbs[0] = bb1, bbs[1] = bb0;
+    }
+  else if (__builtin_expect (n == 3, true))
+    {
+      basic_block t, bb0 = bbs[0], bb1 = bbs[1], bb2 = bbs[2];
+      if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
+	t = bb0, bb0 = bb1, bb1 = t;
+      if (bb_postorder[bb1->index] < bb_postorder[bb2->index])
+	{
+	  t = bb1, bb1 = bb2, bb2 = t;
+	  if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
+	    t = bb0, bb0 = bb1, bb1 = t;
+	}
+      bbs[0] = bb0, bbs[1] = bb1, bbs[2] = bb2;
+    }
+  else
+    qsort (bbs, n, sizeof *bbs, cmp_bb_postorder);
 }
 
 /* Constructor for a dom walker.
@@ -284,16 +311,8 @@ dom_walker::walk (basic_block bb)
 	  for (dest = first_dom_son (m_dom_direction, bb);
 	       dest; dest = next_dom_son (m_dom_direction, dest))
 	    worklist[sp++] = dest;
-	  if (m_dom_direction == CDI_DOMINATORS)
-	    switch (sp - saved_sp)
-	      {
-	      case 0:
-	      case 1:
-		break;
-	      default:
-		qsort (&worklist[saved_sp], sp - saved_sp,
-		       sizeof (basic_block), cmp_bb_postorder);
-	      }
+	  if (sp - saved_sp > 1 && m_dom_direction == CDI_DOMINATORS)
+	    sort_bbs_postorder (&worklist[saved_sp], sp - saved_sp);
 	}
       /* NULL is used to mark pop operations in the recursion stack.  */
       while (sp > 0 && !worklist[sp - 1])
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Optimize BB sorting in domwalk
  2017-07-24 11:30 [PATCH] Optimize BB sorting in domwalk Alexander Monakov
@ 2017-07-24 12:06 ` Ulrich Drepper
  2017-07-24 22:14 ` Jeff Law
  1 sibling, 0 replies; 6+ messages in thread
From: Ulrich Drepper @ 2017-07-24 12:06 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: GCC Patches

Not commenting on the correctness... but

On Mon, Jul 24, 2017 at 1:29 PM, Alexander Monakov <amonakov@ispras.ru> wrote:
> +      basic_block bb0 = bbs[0], bb1 = bbs[1];
> +      if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
> +       bbs[0] = bb1, bbs[1] = bb0;
> +    }
> +  else if (__builtin_expect (n == 3, true))
> +    {
> +      basic_block t, bb0 = bbs[0], bb1 = bbs[1], bb2 = bbs[2];
> +      if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
> +       t = bb0, bb0 = bb1, bb1 = t;
> +      if (bb_postorder[bb1->index] < bb_postorder[bb2->index])
> +       {
> +         t = bb1, bb1 = bb2, bb2 = t;
> +         if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
> +           t = bb0, bb0 = bb1, bb1 = t;
> +       }
> +      bbs[0] = bb0, bbs[1] = bb1, bbs[2] = bb2;

... maybe use std::swap() in all four cases?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Optimize BB sorting in domwalk
  2017-07-24 11:30 [PATCH] Optimize BB sorting in domwalk Alexander Monakov
  2017-07-24 12:06 ` Ulrich Drepper
@ 2017-07-24 22:14 ` Jeff Law
  2017-07-25  8:23   ` Alexander Monakov
  1 sibling, 1 reply; 6+ messages in thread
From: Jeff Law @ 2017-07-24 22:14 UTC (permalink / raw)
  To: Alexander Monakov, gcc-patches

On 07/24/2017 05:29 AM, Alexander Monakov wrote:
> Profiling uses of qsort in GCC reveals that a significant portion of calls
> comes from domwalk.c where child nodes in dominator tree are reordered
> according to postorder numbering.  However we know that in dominator trees
> the vast majority of nodes have 0, 2 or 3 children, and handling those
> cases separately allows to avoid qsort overhead.
> 
> I don't have trunk figures handy, but on gcc-6 release-checking tramp3d
> compilation this would eliminate about 63% of _all_ qsort calls by count,
> or about 27% by time.  Call count profile looks like:
> 
> N=0	N=1	N=2	N=3	N=4	N=5	N=6 	...	Total
> 16111   9406    228607  91870   20154   8317    3823            406971
> 
> With this patch it would look like
> 
> N=0	N=1	N=2	N=3	N=4	N=5	N=6	...	Total
> 16111   9406    32200   34132   16966   8022    3702            149177
> 
> While at it, I've also added a clarifying comment to bb_postorder (it
> actually gives reverse postorder), simplified casts in cmp_bb_postorder
> and changed it to compute the result in a branchless fashion.
> 
> Bootstrapped and regtested on x86-64, also verified that gcc/*.o files are
> unchanged except for domwalk.o and *-checksum.o.  OK for trunk?
> 
> 	* domwalk.c (cmp_bb_postorder): Simplify.
> 	(sort_bbs_postorder): New function.  Use it...
> 	(dom_walker::walk): ...here to optimize common cases.
Is it really beneficial to write cmp_bb_postorder without simple and
trivial to understand conditionals?  And if it is, then doesn't that
actually suggest that we should have a match.pd pattern to turn the
branchy code into a branchless sequence?

As Uli noted, we should be using std::swap.

Can you please repost ?

THanks,
Jeff

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Optimize BB sorting in domwalk
  2017-07-24 22:14 ` Jeff Law
@ 2017-07-25  8:23   ` Alexander Monakov
  2017-07-25  8:46     ` Richard Biener
  2017-07-25  8:51     ` Alexander Monakov
  0 siblings, 2 replies; 6+ messages in thread
From: Alexander Monakov @ 2017-07-25  8:23 UTC (permalink / raw)
  To: Jeff Law; +Cc: gcc-patches

On Mon, 24 Jul 2017, Jeff Law wrote:
> As Uli noted, we should be using std::swap.
> 
> Can you please repost ?

	* domwalk.c (cmp_bb_postorder): Simplify.
	(sort_bbs_postorder): New function.  Use it...
	(dom_walker::walk): ...here to optimize common cases.

---
 gcc/domwalk.c | 53 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/gcc/domwalk.c b/gcc/domwalk.c
index a0daae6b2d8..df51b8c4451 100644
--- a/gcc/domwalk.c
+++ b/gcc/domwalk.c
@@ -128,19 +128,46 @@ along with GCC; see the file COPYING3.  If not see
     which is currently an abstraction over walking tree statements.  Thus
     the dominator walker is currently only useful for trees.  */
 
+/* Reverse postorder index of each basic block.  */
 static int *bb_postorder;
 
 static int
 cmp_bb_postorder (const void *a, const void *b)
 {
-  basic_block bb1 = *(basic_block *)const_cast<void *>(a);
-  basic_block bb2 = *(basic_block *)const_cast<void *>(b);
-  if (bb1->index == bb2->index)
-    return 0;
+  basic_block bb1 = *(const basic_block *)(a);
+  basic_block bb2 = *(const basic_block *)(b);
+  int n1 = bb_postorder[bb1->index], n2 = bb_postorder[bb2->index];
   /* Place higher completion number first (pop off lower number first).  */
-  if (bb_postorder[bb1->index] > bb_postorder[bb2->index])
-    return -1;
-  return 1;
+  return (n1 < n2) - (n1 > n2);
+}
+
+/* Permute array BBS of N basic blocks in postorder,
+   i.e. by descending number in BB_POSTORDER array.  */
+
+static void
+sort_bbs_postorder (basic_block *bbs, int n)
+{
+  if (__builtin_expect (n == 2, true))
+    {
+      basic_block bb0 = bbs[0], bb1 = bbs[1];
+      if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
+	bbs[0] = bb1, bbs[1] = bb0;
+    }
+  else if (__builtin_expect (n == 3, true))
+    {
+      basic_block bb0 = bbs[0], bb1 = bbs[1], bb2 = bbs[2];
+      if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
+	std::swap (bb0, bb1);
+      if (bb_postorder[bb1->index] < bb_postorder[bb2->index])
+	{
+	  std::swap (bb1, bb2);
+	  if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
+	    std::swap (bb0, bb1);
+	}
+      bbs[0] = bb0, bbs[1] = bb1, bbs[2] = bb2;
+    }
+  else
+    qsort (bbs, n, sizeof *bbs, cmp_bb_postorder);
 }
 
 /* Constructor for a dom walker.
@@ -284,16 +311,8 @@ dom_walker::walk (basic_block bb)
 	  for (dest = first_dom_son (m_dom_direction, bb);
 	       dest; dest = next_dom_son (m_dom_direction, dest))
 	    worklist[sp++] = dest;
-	  if (m_dom_direction == CDI_DOMINATORS)
-	    switch (sp - saved_sp)
-	      {
-	      case 0:
-	      case 1:
-		break;
-	      default:
-		qsort (&worklist[saved_sp], sp - saved_sp,
-		       sizeof (basic_block), cmp_bb_postorder);
-	      }
+	  if (sp - saved_sp > 1 && m_dom_direction == CDI_DOMINATORS)
+	    sort_bbs_postorder (&worklist[saved_sp], sp - saved_sp);
 	}
       /* NULL is used to mark pop operations in the recursion stack.  */
       while (sp > 0 && !worklist[sp - 1])
-- 
2.13.3

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Optimize BB sorting in domwalk
  2017-07-25  8:23   ` Alexander Monakov
@ 2017-07-25  8:46     ` Richard Biener
  2017-07-25  8:51     ` Alexander Monakov
  1 sibling, 0 replies; 6+ messages in thread
From: Richard Biener @ 2017-07-25  8:46 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: Jeff Law, GCC Patches

On Tue, Jul 25, 2017 at 10:22 AM, Alexander Monakov <amonakov@ispras.ru> wrote:
> On Mon, 24 Jul 2017, Jeff Law wrote:
>> As Uli noted, we should be using std::swap.
>>
>> Can you please repost ?
>
>         * domwalk.c (cmp_bb_postorder): Simplify.
>         (sort_bbs_postorder): New function.  Use it...
>         (dom_walker::walk): ...here to optimize common cases.

Ok.  Note that I think vec::qsort might benefit from handling length
() == 2 and domwalk
might benefit from using vec::qsort if it would work on a sub-vector
as we are asking for
here...

Richard.

> ---
>  gcc/domwalk.c | 53 ++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 36 insertions(+), 17 deletions(-)
>
> diff --git a/gcc/domwalk.c b/gcc/domwalk.c
> index a0daae6b2d8..df51b8c4451 100644
> --- a/gcc/domwalk.c
> +++ b/gcc/domwalk.c
> @@ -128,19 +128,46 @@ along with GCC; see the file COPYING3.  If not see
>      which is currently an abstraction over walking tree statements.  Thus
>      the dominator walker is currently only useful for trees.  */
>
> +/* Reverse postorder index of each basic block.  */
>  static int *bb_postorder;
>
>  static int
>  cmp_bb_postorder (const void *a, const void *b)
>  {
> -  basic_block bb1 = *(basic_block *)const_cast<void *>(a);
> -  basic_block bb2 = *(basic_block *)const_cast<void *>(b);
> -  if (bb1->index == bb2->index)
> -    return 0;
> +  basic_block bb1 = *(const basic_block *)(a);
> +  basic_block bb2 = *(const basic_block *)(b);
> +  int n1 = bb_postorder[bb1->index], n2 = bb_postorder[bb2->index];
>    /* Place higher completion number first (pop off lower number first).  */
> -  if (bb_postorder[bb1->index] > bb_postorder[bb2->index])
> -    return -1;
> -  return 1;
> +  return (n1 < n2) - (n1 > n2);
> +}
> +
> +/* Permute array BBS of N basic blocks in postorder,
> +   i.e. by descending number in BB_POSTORDER array.  */
> +
> +static void
> +sort_bbs_postorder (basic_block *bbs, int n)
> +{
> +  if (__builtin_expect (n == 2, true))
> +    {
> +      basic_block bb0 = bbs[0], bb1 = bbs[1];
> +      if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
> +       bbs[0] = bb1, bbs[1] = bb0;
> +    }
> +  else if (__builtin_expect (n == 3, true))
> +    {
> +      basic_block bb0 = bbs[0], bb1 = bbs[1], bb2 = bbs[2];
> +      if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
> +       std::swap (bb0, bb1);
> +      if (bb_postorder[bb1->index] < bb_postorder[bb2->index])
> +       {
> +         std::swap (bb1, bb2);
> +         if (bb_postorder[bb0->index] < bb_postorder[bb1->index])
> +           std::swap (bb0, bb1);
> +       }
> +      bbs[0] = bb0, bbs[1] = bb1, bbs[2] = bb2;
> +    }
> +  else
> +    qsort (bbs, n, sizeof *bbs, cmp_bb_postorder);
>  }
>
>  /* Constructor for a dom walker.
> @@ -284,16 +311,8 @@ dom_walker::walk (basic_block bb)
>           for (dest = first_dom_son (m_dom_direction, bb);
>                dest; dest = next_dom_son (m_dom_direction, dest))
>             worklist[sp++] = dest;
> -         if (m_dom_direction == CDI_DOMINATORS)
> -           switch (sp - saved_sp)
> -             {
> -             case 0:
> -             case 1:
> -               break;
> -             default:
> -               qsort (&worklist[saved_sp], sp - saved_sp,
> -                      sizeof (basic_block), cmp_bb_postorder);
> -             }
> +         if (sp - saved_sp > 1 && m_dom_direction == CDI_DOMINATORS)
> +           sort_bbs_postorder (&worklist[saved_sp], sp - saved_sp);
>         }
>        /* NULL is used to mark pop operations in the recursion stack.  */
>        while (sp > 0 && !worklist[sp - 1])
> --
> 2.13.3

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Optimize BB sorting in domwalk
  2017-07-25  8:23   ` Alexander Monakov
  2017-07-25  8:46     ` Richard Biener
@ 2017-07-25  8:51     ` Alexander Monakov
  1 sibling, 0 replies; 6+ messages in thread
From: Alexander Monakov @ 2017-07-25  8:51 UTC (permalink / raw)
  To: Jeff Law; +Cc: gcc-patches

On Tue, 25 Jul 2017, Alexander Monakov wrote:
> --- a/gcc/domwalk.c
> +++ b/gcc/domwalk.c
> @@ -128,19 +128,46 @@ along with GCC; see the file COPYING3.  If not see
>      which is currently an abstraction over walking tree statements.  Thus
>      the dominator walker is currently only useful for trees.  */
>  
> +/* Reverse postorder index of each basic block.  */
>  static int *bb_postorder;
>  
>  static int
>  cmp_bb_postorder (const void *a, const void *b)
>  {
> -  basic_block bb1 = *(basic_block *)const_cast<void *>(a);
> -  basic_block bb2 = *(basic_block *)const_cast<void *>(b);
> -  if (bb1->index == bb2->index)
> -    return 0;
> +  basic_block bb1 = *(const basic_block *)(a);
> +  basic_block bb2 = *(const basic_block *)(b);
> +  int n1 = bb_postorder[bb1->index], n2 = bb_postorder[bb2->index];
>    /* Place higher completion number first (pop off lower number first).  */
> -  if (bb_postorder[bb1->index] > bb_postorder[bb2->index])
> -    return -1;
> -  return 1;
> +  return (n1 < n2) - (n1 > n2);

Actually since the 'int *bb_postorder' array is not going to hold negative
entries, this can be simply

  return n2 - n1;

(the (A > B) - (A < B) formulation is useful in case signed A - B may overflow)

Alexander

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2017-07-25  8:51 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-07-24 11:30 [PATCH] Optimize BB sorting in domwalk Alexander Monakov
2017-07-24 12:06 ` Ulrich Drepper
2017-07-24 22:14 ` Jeff Law
2017-07-25  8:23   ` Alexander Monakov
2017-07-25  8:46     ` Richard Biener
2017-07-25  8:51     ` Alexander Monakov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).