public inbox for gcc-patches@gcc.gnu.org
* [x86, PATCH] operand reordering for commutative operations
@ 2014-12-29 13:54 Yuri Rumyantsev
  2014-12-29 14:04 ` H.J. Lu
  2015-01-05 20:26 ` Jeff Law
  0 siblings, 2 replies; 7+ messages in thread
From: Yuri Rumyantsev @ 2014-12-29 13:54 UTC (permalink / raw)
  To: gcc-patches, Igor Zamyatin

[-- Attachment #1: Type: text/plain, Size: 929 bytes --]

Hi All,

Here is a patch which fixes several performance degradations that appeared
after operand canonicalization (r216728).  A very simple approach is used:
if an operation is commutative and its second operand requires more
operations (statements) to compute, swap the operands.
Currently this is done under a dedicated option which is enabled by default
only for 32-bit x86 targets (we have not seen any performance improvement
on 64-bit).
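
To illustrate the kind of code this targets, here is a hypothetical loop
(not one of the affected benchmarks): the second operand of the commutative
addition is heavier than the first, so the patch swaps them right before
expansion:

  void
  foo (int *a, int *b, int *c, int *d, int n)
  {
    int i;
    for (i = 0; i < n; i++)
      /* The second operand of the commutative '+' needs three statements
         (two loads and a multiply), the first only one load, so it is the
         one worth computing first.  */
      d[i] = a[i] + b[i] * c[i];
  }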

Is it OK for trunk?

2014-12-26  Yuri Rumyantsev  <ysrumyan@gmail.com>

* cfgexpand.c (count_num_stmt): New function.
(reorder_operands): Likewise.
(expand_gimple_basic_block): Insert call of reorder_operands.
* common.opt (flag_reorder_operands): Add new flag.
* config/i386/i386.c (ix86_option_override_internal): Set
flag_reorder_operands for 32-bit targets only.
* doc/invoke.texi: Add new optimization option -freorder-operands.

gcc/testsuite/ChangeLog
* gcc.target/i386/swap_opnd.c: New test.

[-- Attachment #2: patch --]
[-- Type: application/octet-stream, Size: 7484 bytes --]

diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c
index 5200053..01b745d 100644
--- a/gcc/cfgexpand.c
+++ b/gcc/cfgexpand.c
@@ -4884,6 +4884,148 @@ expand_debug_locations (void)
   flag_strict_aliasing = save_strict_alias;
 }
 
+/* Determine the number of statements required to compute the value of VAR,
+   adding it to *COUNT.  Return false if the count cannot be determined.  */
+
+static bool
+count_num_stmt (tree var, int *count)
+{
+  gimple stmt;
+  tree op;
+  ssa_op_iter iter;
+
+  /* Do not handle vector operands.  */
+  if (TREE_CODE (TREE_TYPE (var)) == VECTOR_TYPE)
+    return false;
+  if (TREE_CODE (var) != SSA_NAME)
+    return true;
+  stmt = get_gimple_for_ssa_name (var);
+  if (stmt == NULL)
+    {
+      /* Assume that we have PHI node or def_stmt is in another bb.  */
+      *count += 2;
+      return true;
+    }
+
+  if (gimple_has_volatile_ops (stmt))
+    return false;
+  if (gimple_assign_load_p (stmt))
+    {
+      *count += 1;
+      return true;
+    }
+  if (is_gimple_call (stmt))
+    {
+      *count += 10;
+      return true;
+    }
+  if (gimple_clobber_p (stmt))
+    return true;
+
+  *count += 1;
+  FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
+    {
+      if (TREE_CODE (TREE_TYPE (op)) == VECTOR_TYPE)
+	return false;
+      if (!count_num_stmt (op, count))
+	return false;
+    }
+  return true;
+}
+
+/* Swap the operands of commutative operations so that the heavier operand
+   (the one that requires more instructions to compute) comes first.  This
+   is done only inside loops to save compile time.  */
+
+static void
+reorder_operands (basic_block bb)
+{
+  gimple_stmt_iterator gsi;
+  gimple_seq stmts;
+  gimple stmt = NULL;
+  enum tree_code code;
+  gimple def0, def1;
+  tree op0, op1;
+  bool swap;
+  int count1, count2;
+
+  if (!flag_reorder_operands)
+    return;
+
+  if (!bb->loop_father)
+    return;
+  stmts = bb_seq (bb);
+  for (gsi = gsi_start (stmts); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      /* Special handling of equality/non-equality which are not
+	 considered as commutative operations.  */
+      if (gimple_code (stmt) == GIMPLE_COND)
+	{
+	  code = gimple_cond_code (stmt);
+	  if ((code == EQ_EXPR || code == NE_EXPR)
+	      && (TREE_CODE (gimple_cond_lhs (stmt)) == SSA_NAME)
+	      && (TREE_CODE (gimple_cond_rhs (stmt)) == SSA_NAME))
+	    {
+	      op0 = gimple_cond_lhs (stmt);
+	      op1 = gimple_cond_rhs (stmt);
+	    }
+	  else
+	    continue;
+	}
+      else
+	{
+	  /* Consider assignments only.  */
+	  if (gimple_code (stmt) != GIMPLE_ASSIGN
+	      || gimple_has_volatile_ops (stmt))
+	    continue;
+          code = gimple_assign_rhs_code (stmt);
+          if (!commutative_tree_code (code))
+	    continue;
+          gcc_assert (gimple_num_ops (stmt) == 3);
+          op0 = gimple_op (stmt, 1);
+          op1 = gimple_op (stmt, 2);
+          if (op0 == NULL_TREE || op1 == NULL_TREE)
+	    continue;
+          if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (op1) != SSA_NAME)
+	    continue;
+      }
+      def0 = get_gimple_for_ssa_name (op0);
+      def1 = get_gimple_for_ssa_name (op1);
+
+      /* Skip stmt if its operands do not have definition in BB.  */
+      if (def0 == NULL || def1 == NULL)
+	continue;
+
+      count1 = count2 = 0;
+      if (!count_num_stmt (op0, &count1)
+	  || !count_num_stmt (op1, &count2))
+	continue;
+      swap = false;
+      if (count2 > count1)
+        swap = true;
+      if (swap)
+	{
+	  /* Swap operands.  */
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    {
+	      fprintf (dump_file, "Swap operands in stmt:\n");
+	      print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
+	    }
+	  if (gimple_code (stmt) == GIMPLE_ASSIGN)
+	    {
+	      gimple_set_op (stmt, 1, op1);
+	      gimple_set_op (stmt, 2, op0);
+	    }
+	  else if (gimple_code (stmt) == GIMPLE_COND)
+	    {
+	      gimple_cond_set_lhs (stmt, op1);
+	      gimple_cond_set_rhs (stmt, op0);
+	    }
+	}
+    }
+}
+
 /* Expand basic block BB from GIMPLE trees to RTL.  */
 
 static basic_block
@@ -4905,6 +5047,7 @@ expand_gimple_basic_block (basic_block bb, bool disable_tail_calls)
      cannot use the gsi_*_bb() routines because they expect the basic
      block to be in GIMPLE, instead of RTL.  Therefore, we need to
      access the BB sequence directly.  */
+  reorder_operands (bb);
   stmts = bb_seq (bb);
   bb->il.gimple.seq = NULL;
   bb->il.gimple.phi_nodes = NULL;
diff --git a/gcc/common.opt b/gcc/common.opt
index da5250b..833bd10 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1848,6 +1848,10 @@ freorder-functions
 Common Report Var(flag_reorder_functions) Optimization
 Reorder functions to improve code placement
 
+freorder-operands
+Common Report Var(flag_reorder_operands) Optimization
+Perform operand reordering for commutative operations
+
 frerun-cse-after-loop
 Common Report Var(flag_rerun_cse_after_loop) Optimization
 Add a common subexpression elimination pass after loop optimizations
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ec3e056..23d5d95 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -3829,6 +3829,10 @@ ix86_option_override_internal (bool main_args_p,
   if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
     opts->x_ix86_regparm = REGPARM_MAX;
 
+  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
+    if (opts->x_optimize >= 1 && !opts_set->x_flag_reorder_operands)
+      opts->x_flag_reorder_operands = 1;
+
   /* Default align_* from the processor table.  */
   if (opts->x_align_loops == 0)
     {
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9f21c96..861da34 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -406,7 +406,7 @@ Objective-C and Objective-C++ Dialects}.
 -fprofile-generate=@var{path} @gol
 -fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol
 -freciprocal-math -free -frename-registers -freorder-blocks @gol
--freorder-blocks-and-partition -freorder-functions @gol
+-freorder-blocks-and-partition -freorder-functions  -freorder-operands @gol
 -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
 -frounding-math -fsched2-use-superblocks -fsched-pressure @gol
 -fsched-spec-load -fsched-spec-load-dangerous @gol
@@ -8662,6 +8662,13 @@ Also profile feedback must be available to make this option effective.  See
 
 Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}.
 
+@item -freorder-operands
+@opindex freorder-operands
+Reorder operands of commutative operations to evaluate the heavier operand
+(the one requiring more instructions) first, shrinking register live ranges.
+
+Enabled for x86 32-bit targets at levels @option{-O2}, @option{-O3}.
+
 @item -fstrict-aliasing
 @opindex fstrict-aliasing
 Allow the compiler to assume the strictest aliasing rules applicable to
diff --git a/gcc/testsuite/gcc.target/i386/swap_opnd.c b/gcc/testsuite/gcc.target/i386/swap_opnd.c
new file mode 100644
index 0000000..5b0c11c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/swap_opnd.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ia32 } */
+/* { dg-options "-O2 -fdump-rtl-expand-details" } */
+#define N 256
+int a1[N], a2[N], a3[N], a4[N];
+
+void foo()
+{
+  int i;
+  for (i=0; i<N; i++) {
+    if (a3[i] != a1[i] + a2[i]) {
+       a4[i] += 1;
+       a1[i] = a2[i] - 1;
+    }
+  }
+}
+/* { dg-final { scan-rtl-dump-times "Swap operands" 1 "expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */


* Re: [x86, PATCH] operand reordering for commutative operations
  2014-12-29 13:54 [x86, PATCH] operand reordering for commutative operations Yuri Rumyantsev
@ 2014-12-29 14:04 ` H.J. Lu
  2014-12-29 15:29   ` Yuri Rumyantsev
  2015-01-05 20:26 ` Jeff Law
  1 sibling, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2014-12-29 14:04 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: gcc-patches, Igor Zamyatin

On Mon, Dec 29, 2014 at 5:30 AM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi All,
>
> Here is a patch which fixes several performance degradations that appeared
> after operand canonicalization (r216728).  A very simple approach is used:
> if an operation is commutative and its second operand requires more
> operations (statements) to compute, swap the operands.
> Currently this is done under a dedicated option which is enabled by default
> only for 32-bit x86 targets (we have not seen any performance improvement
> on 64-bit).

Can you open a regression bug? Do you know how it improves
performance in 32-bit?

> Is it OK for trunk?
>
> 2014-12-26  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * cfgexpand.c (count_num_stmt): New function.
> (reorder_operands): Likewise.
> (expand_gimple_basic_block): Insert call of reorder_operands.
> * common.opt (flag_reorder_operands): Add new flag.
> * config/i386/i386.c (ix86_option_override_internal): Set
> flag_reorder_operands for 32-bit targets only.
> * doc/invoke.texi: Add new optimization option -freorder-operands.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/swap_opnd.c: New test.



-- 
H.J.


* Re: [x86, PATCH] operand reordering for commutative operations
  2014-12-29 14:04 ` H.J. Lu
@ 2014-12-29 15:29   ` Yuri Rumyantsev
  0 siblings, 0 replies; 7+ messages in thread
From: Yuri Rumyantsev @ 2014-12-29 15:29 UTC (permalink / raw)
  To: H.J. Lu; +Cc: gcc-patches, Igor Zamyatin

Hi H.J.

A bug with a reproducer has been submitted:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64434


2014-12-29 16:53 GMT+03:00 H.J. Lu <hjl.tools@gmail.com>:
> On Mon, Dec 29, 2014 at 5:30 AM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
>> Hi All,
>>
>> Here is a patch which fixes several performance degradations that appeared
>> after operand canonicalization (r216728).  A very simple approach is used:
>> if an operation is commutative and its second operand requires more
>> operations (statements) to compute, swap the operands.
>> Currently this is done under a dedicated option which is enabled by default
>> only for 32-bit x86 targets (we have not seen any performance improvement
>> on 64-bit).
>
> Can you open a regression bug? Do you know how it improves
> performance in 32-bit?
>
>> Is it OK for trunk?
>>
>> 2014-12-26  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>
>> * cfgexpand.c (count_num_stmt): New function.
>> (reorder_operands): Likewise.
>> (expand_gimple_basic_block): Insert call of reorder_operands.
>> * common.opt (flag_reorder_operands): Add new flag.
>> * config/i386/i386.c (ix86_option_override_internal): Set
>> flag_reorder_operands for 32-bit targets only.
>> * doc/invoke.texi: Add new optimization option -freorder-operands.
>>
>> gcc/testsuite/ChangeLog
>> * gcc.target/i386/swap_opnd.c: New test.
>
>
>
> --
> H.J.


* Re: [x86, PATCH] operand reordering for commutative operations
  2014-12-29 13:54 [x86, PATCH] operand reordering for commutative operations Yuri Rumyantsev
  2014-12-29 14:04 ` H.J. Lu
@ 2015-01-05 20:26 ` Jeff Law
  2015-01-09 10:21   ` Richard Biener
  1 sibling, 1 reply; 7+ messages in thread
From: Jeff Law @ 2015-01-05 20:26 UTC (permalink / raw)
  To: Yuri Rumyantsev, gcc-patches, Igor Zamyatin

On 12/29/14 06:30, Yuri Rumyantsev wrote:
> Hi All,
>
> Here is a patch which fixes several performance degradations that appeared
> after operand canonicalization (r216728).  A very simple approach is used:
> if an operation is commutative and its second operand requires more
> operations (statements) to compute, swap the operands.
> Currently this is done under a dedicated option which is enabled by default
> only for 32-bit x86 targets (we have not seen any performance improvement
> on 64-bit).
>
> Is it OK for trunk?
>
> 2014-12-26  Yuri Rumyantsev  <ysrumyan@gmail.com>
>
> * cfgexpand.c (count_num_stmt): New function.
> (reorder_operands): Likewise.
> (expand_gimple_basic_block): Insert call of reorder_operands.
> * common.opt (flag_reorder_operands): Add new flag.
> * config/i386/i386.c (ix86_option_override_internal): Set
> flag_reorder_operands for 32-bit targets only.
> * doc/invoke.texi: Add new optimization option -freorder-operands.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/swap_opnd.c: New test.
I'd do this unconditionally -- I don't think there's a compelling reason 
to add another flag here.

Could you use estimate_num_insns rather than rolling your own estimate 
code here?  All you have to do is set up the weights structure and call 
the estimation code.  I wouldn't be surprised if ultimately the existing 
insn estimator is better than the one you're adding.

Make sure to reference the PR in the ChangeLog.

Please update and resubmit.

Thanks,
Jeff


* Re: [x86, PATCH] operand reordering for commutative operations
  2015-01-05 20:26 ` Jeff Law
@ 2015-01-09 10:21   ` Richard Biener
  2015-01-12 15:12     ` Yuri Rumyantsev
  0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2015-01-09 10:21 UTC (permalink / raw)
  To: Jeff Law; +Cc: Yuri Rumyantsev, gcc-patches, Igor Zamyatin

On Mon, Jan 5, 2015 at 9:26 PM, Jeff Law <law@redhat.com> wrote:
> On 12/29/14 06:30, Yuri Rumyantsev wrote:
>>
>> Hi All,
>>
>> Here is a patch which fixes several performance degradations that appeared
>> after operand canonicalization (r216728).  A very simple approach is used:
>> if an operation is commutative and its second operand requires more
>> operations (statements) to compute, swap the operands.
>> Currently this is done under a dedicated option which is enabled by default
>> only for 32-bit x86 targets (we have not seen any performance improvement
>> on 64-bit).
>>
>> Is it OK for trunk?
>>
>> 2014-12-26  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>
>> * cfgexpand.c (count_num_stmt): New function.
>> (reorder_operands): Likewise.
>> (expand_gimple_basic_block): Insert call of reorder_operands.
>> * common.opt (flag_reorder_operands): Add new flag.
>> * config/i386/i386.c (ix86_option_override_internal): Set
>> flag_reorder_operands for 32-bit targets only.
>> * doc/invoke.texi: Add new optimization option -freorder-operands.
>>
>> gcc/testsuite/ChangeLog
>> * gcc.target/i386/swap_opnd.c: New test.
>
> I'd do this unconditionally -- I don't think there's a compelling reason to
> add another flag here.

Indeed.

> Could you use estimate_num_insns rather than rolling your own estimate code
> here?  All you have to do is set up the weights structure and call the
> estimation code.  I wouldn't be surprised if ultimately the existing insn
> estimator is better than the one you're adding.

Just use eni_size_weights.

Your counting is quadratic, that's a no-go.  You'd have to keep a lattice
of counts for SSA names to avoid this.  There is swap_ssa_operands (),
in your swapping code you fail to update SSA operands (maybe non-fatal
because we are just expanding to RTL, but ...).

bb->loop_father is always non-NULL, but doing this everywhere, not only
in loops looks fine to me.

You can swap comparison operands on GIMPLE_CONDs for all
codes by also swapping the EDGE_TRUE_VALUE/EDGE_FALSE_VALUE
flags on the outgoing BB edges.

There are more cases that can be swapped in regular stmts as well,
but I suppose we don't need to be "complete" here.

So, in reorder_operands I'd do (pseudo-code)

  n = 0;
  for-all-stmts
    gimple_set_uid (stmt, n++);
  lattice = XALLOCVEC (unsigned, n);

  i = 0;
  for-all-stmts
    this_stmt_cost = estimate_num_insns (stmt, &eni_size_weights);
    lattice[i] = this_stmt_cost;
    FOR_EACH_SSA_USE_OPERAND ()
       if (use-in-this-BB)
         lattice[i] += lattice[gimple_uid (SSA_NAME_DEF_STMT)];
     i++;
    swap-if-operand-cost says so
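
Filled in, a rough and untested sketch of that shape might look like the
following (it keeps get_gimple_for_ssa_name from the original patch rather
than SSA_NAME_DEF_STMT, so PHIs and defs outside BB simply do not contribute,
and it leaves out the GIMPLE_COND handling):

static void
reorder_operands (basic_block bb)
{
  gimple_stmt_iterator gsi;
  gimple stmt;
  unsigned int *lattice, n = 0, i = 0;
  use_operand_p use_p;
  ssa_op_iter iter;

  /* Number the statements of BB and give each one a lattice slot.  */
  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    gimple_set_uid (gsi_stmt (gsi), n++);
  if (n < 2)
    return;
  lattice = XALLOCAVEC (unsigned int, n);

  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi), i++)
    {
      tree op0, op1;
      gimple def0, def1;

      stmt = gsi_stmt (gsi);
      /* Cost of the statement itself plus the cost of the same-BB
         definitions feeding it, already computed because they come
         earlier in the sequence.  */
      lattice[i] = estimate_num_insns (stmt, &eni_size_weights);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
        {
          gimple def = get_gimple_for_ssa_name (USE_FROM_PTR (use_p));
          if (def && gimple_bb (def) == bb)
            lattice[i] += lattice[gimple_uid (def)];
        }

      if (!is_gimple_assign (stmt)
          || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
        continue;
      op0 = gimple_assign_rhs1 (stmt);
      op1 = gimple_assign_rhs2 (stmt);
      if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (op1) != SSA_NAME)
        continue;
      def0 = get_gimple_for_ssa_name (op0);
      def1 = get_gimple_for_ssa_name (op1);
      if (!def0 || !def1
          || gimple_bb (def0) != bb || gimple_bb (def1) != bb)
        continue;
      /* Put the more expensive operand first; swap_ssa_operands keeps
         the SSA operand caches consistent.  */
      if (lattice[gimple_uid (def1)] > lattice[gimple_uid (def0)])
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }
}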

Richard.

> Make sure to reference the PR in the ChangeLog.
>
> Please update and resubmit.
>
> Thanks,
> Jeff


* Re: [x86, PATCH] operand reordering for commutative operations
  2015-01-09 10:21   ` Richard Biener
@ 2015-01-12 15:12     ` Yuri Rumyantsev
  2015-01-12 15:13       ` Richard Biener
  0 siblings, 1 reply; 7+ messages in thread
From: Yuri Rumyantsev @ 2015-01-12 15:12 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

Hi All,

Thanks a lot for your comments.
I've rewritten reorder_operands as you proposed, but I'd like to know
whether we should apply this reordering at -O0.

I will re-send the patch after testing completion.
Thanks.
Yuri.

2015-01-09 13:13 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Mon, Jan 5, 2015 at 9:26 PM, Jeff Law <law@redhat.com> wrote:
>> On 12/29/14 06:30, Yuri Rumyantsev wrote:
>>>
>>> Hi All,
>>>
>>> Here is a patch which fixes several performance degradations that appeared
>>> after operand canonicalization (r216728).  A very simple approach is used:
>>> if an operation is commutative and its second operand requires more
>>> operations (statements) to compute, swap the operands.
>>> Currently this is done under a dedicated option which is enabled by default
>>> only for 32-bit x86 targets (we have not seen any performance improvement
>>> on 64-bit).
>>>
>>> Is it OK for trunk?
>>>
>>> 2014-12-26  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>
>>> * cfgexpand.c (count_num_stmt): New function.
>>> (reorder_operands): Likewise.
>>> (expand_gimple_basic_block): Insert call of reorder_operands.
>>> * common.opt (flag_reorder_operands): Add new flag.
>>> * config/i386/i386.c (ix86_option_override_internal): Set
>>> flag_reorder_operands for 32-bit targets only.
>>> * doc/invoke.texi: Add new optimization option -freorder-operands.
>>>
>>> gcc/testsuite/ChangeLog
>>> * gcc.target/i386/swap_opnd.c: New test.
>>
>> I'd do this unconditionally -- I don't think there's a compelling reason to
>> add another flag here.
>
> Indeed.
>
>> Could you use estimate_num_insns rather than rolling your own estimate code
>> here?  All you have to do is set up the weights structure and call the
>> estimation code.  I wouldn't be surprised if ultimately the existing insn
>> estimator is better than the one you're adding.
>
> Just use eni_size_weights.
>
> Your counting is quadratic, that's a no-go.  You'd have to keep a lattice
> of counts for SSA names to avoid this.  There is swap_ssa_operands (),
> in your swapping code you fail to update SSA operands (maybe non-fatal
> because we are just expanding to RTL, but ...).
>
> bb->loop_father is always non-NULL, but doing this everywhere, not only
> in loops looks fine to me.
>
> You can swap comparison operands on GIMPLE_CONDs for all
> codes by also swapping the EDGE_TRUE_VALUE/EDGE_FALSE_VALUE
> flags on the outgoing BB edges.
>
> There are more cases that can be swapped in regular stmts as well,
> but I suppose we don't need to be "complete" here.
>
> So, in reorder_operands I'd do (pseudo-code)
>
>   n = 0;
>   for-all-stmts
>     gimple_set_uid (stmt, n++);
>   lattice = XALLOCVEC (unsigned, n);
>
>   i = 0;
>   for-all-stmts
>     this_stmt_cost = estimate_num_insns (stmt, &eni_size_weights);
>     lattice[i] = this_stmt_cost;
>     FOR_EACH_SSA_USE_OPERAND ()
>        if (use-in-this-BB)
>          lattice[i] += lattice[gimple_uid (SSA_NAME_DEF_STMT)];
>      i++;
>     swap-if-operand-cost says so
>
> Richard.
>
>> Make sure to reference the PR in the ChangeLog.
>>
>> Please update and resubmit.
>>
>> Thanks,
>> Jeff


* Re: [x86, PATCH] operand reordering for commutative operations
  2015-01-12 15:12     ` Yuri Rumyantsev
@ 2015-01-12 15:13       ` Richard Biener
  0 siblings, 0 replies; 7+ messages in thread
From: Richard Biener @ 2015-01-12 15:13 UTC (permalink / raw)
  To: Yuri Rumyantsev; +Cc: Jeff Law, gcc-patches, Igor Zamyatin

On Mon, Jan 12, 2015 at 4:00 PM, Yuri Rumyantsev <ysrumyan@gmail.com> wrote:
> Hi All,
>
> Thanks a lot for your comments.
> I've rewritten reorder_operands as you proposed, but I'd like to know
> whether we should apply this reordering at -O0.

No, I think we can spare those cycles there.
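
Gating the call site on the optimization level should be all that is needed,
e.g. (sketch only):

  /* In expand_gimple_basic_block.  */
  if (optimize)
    reorder_operands (bb);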

Richard.

> I will re-send the patch after testing completion.
> Thanks.
> Yuri.
>
> 2015-01-09 13:13 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
>> On Mon, Jan 5, 2015 at 9:26 PM, Jeff Law <law@redhat.com> wrote:
>>> On 12/29/14 06:30, Yuri Rumyantsev wrote:
>>>>
>>>> Hi All,
>>>>
>>>> Here is a patch which fixes several performance degradations that appeared
>>>> after operand canonicalization (r216728).  A very simple approach is used:
>>>> if an operation is commutative and its second operand requires more
>>>> operations (statements) to compute, swap the operands.
>>>> Currently this is done under a dedicated option which is enabled by default
>>>> only for 32-bit x86 targets (we have not seen any performance improvement
>>>> on 64-bit).
>>>>
>>>> Is it OK for trunk?
>>>>
>>>> 2014-12-26  Yuri Rumyantsev  <ysrumyan@gmail.com>
>>>>
>>>> * cfgexpand.c (count_num_stmt): New function.
>>>> (reorder_operands): Likewise.
>>>> (expand_gimple_basic_block): Insert call of reorder_operands.
>>>> * common.opt (flag_reorder_operands): Add new flag.
>>>> * config/i386/i386.c (ix86_option_override_internal): Set
>>>> flag_reorder_operands for 32-bit targets only.
>>>> * doc/invoke.texi: Add new optimization option -freorder-operands.
>>>>
>>>> gcc/testsuite/ChangeLog
>>>> * gcc.target/i386/swap_opnd.c: New test.
>>>
>>> I'd do this unconditionally -- I don't think there's a compelling reason to
>>> add another flag here.
>>
>> Indeed.
>>
>>> Could you use estimate_num_insns rather than rolling your own estimate code
>>> here?  All you have to do is set up the weights structure and call the
>>> estimation code.  I wouldn't be surprised if ultimately the existing insn
>>> estimator is better than the one you're adding.
>>
>> Just use eni_size_weights.
>>
>> Your counting is quadratic, that's a no-go.  You'd have to keep a lattice
>> of counts for SSA names to avoid this.  There is swap_ssa_operands (),
>> in your swapping code you fail to update SSA operands (maybe non-fatal
>> because we are just expanding to RTL, but ...).
>>
>> bb->loop_father is always non-NULL, but doing this everywhere, not only
>> in loops looks fine to me.
>>
>> You can swap comparison operands on GIMPLE_CONDs for all
>> codes by also swapping the EDGE_TRUE_VALUE/EDGE_FALSE_VALUE
>> flags on the outgoing BB edges.
>>
>> There are more cases that can be swapped in regular stmts as well,
>> but I suppose we don't need to be "complete" here.
>>
>> So, in reorder_operands I'd do (pseudo-code)
>>
>>   n = 0;
>>   for-all-stmts
>>     gimple_set_uid (stmt, n++);
>>   lattice = XALLOCVEC (unsigned, n);
>>
>>   i = 0;
>>   for-all-stmts
>>     this_stmt_cost = estimate_num_insns (stmt, &eni_size_weights);
>>     lattice[i] = this_stmt_cost;
>>     FOR_EACH_SSA_USE_OPERAND ()
>>        if (use-in-this-BB)
>>          lattice[i] += lattice[gimple_uid (SSA_NAME_DEF_STMT)];
>>      i++;
>>     swap-if-operand-cost says so
>>
>> Richard.
>>
>>> Make sure to reference the PR in the ChangeLog.
>>>
>>> Please update and resubmit.
>>>
>>> Thanks,
>>> Jeff
