[gcc r14-5779] swap ops in reassoc to reduce cross backedge FMA
From: Di Zhao @ 2023-11-23 12:57 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:746344dd53807d840c29f52adba10d0ab093bd3d

commit r14-5779-g746344dd53807d840c29f52adba10d0ab093bd3d
Author: Di Zhao <dizhao@os.amperecomputing.com>
Date:   Thu Nov 9 15:06:37 2023 +0800

    swap ops in reassoc to reduce cross backedge FMA
    
    Previously, for ops.length >= 3, when FMA is present we don't
    rank the operands, so that more FMAs can be preserved.  But this
    brings more FMAs with loop dependency, which leads to worse
    performance on some targets.
    
    Rank the operands (set width=2) when:
    1. avoid_fma_max_bits is set, and
    2. a loop dependent FMA sequence is found.
    
    In this way, we don't have to discard all the FMA candidates
    in the badly shaped sequence in widening_mul; instead, we can
    keep a reduced set of FMAs that have no loop dependency.
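    
    As an illustrative sketch (mirroring foo1 in the new test
    below), consider
    
        result += a * b + c * d;
    
    inside a loop.  The unranked chain (result + a*b) + c*d becomes
    two chained FMAs, fma (c, d, fma (a, b, result)), both sitting
    on the loop-carried path through result.  Ranked with width=2,
    the sum becomes result + (a*b + c*d); only the plain add then
    crosses the backedge, and the single remaining FMA,
    fma (a, b, c*d), has no loop dependency.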
    
    With this patch, there's about a 2% improvement in the
    510.parest_r 1-copy run on ampere1 (with "-Ofast -mcpu=ampere1
    -flto --param avoid-fma-max-bits=512").
    
    PR tree-optimization/110279
    
    gcc/ChangeLog:
    
            * tree-ssa-reassoc.cc (get_reassociation_width): Check
            for loop dependent FMAs.
            (reassociate_bb): For 3 ops, refine the condition to call
            swap_ops_for_binary_stmt.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/pr110279-1.c: New test.

Diff:
---
 gcc/testsuite/gcc.dg/pr110279-1.c | 65 +++++++++++++++++++++++++++++++++
 gcc/tree-ssa-reassoc.cc           | 77 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 136 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr110279-1.c b/gcc/testsuite/gcc.dg/pr110279-1.c
new file mode 100644
index 00000000000..f25b6aec967
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr110279-1.c
@@ -0,0 +1,65 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast --param avoid-fma-max-bits=512 --param tree-reassoc-width=4 -fdump-tree-widening_mul-details" } */
+/* { dg-additional-options "-march=armv8.2-a" { target aarch64-*-* } } */
+
+#define LOOP_COUNT 800000000
+typedef double data_e;
+
+/* Check that FMAs with backedge dependency are avoided.  Otherwise no FMA
+   would be generated with "--param avoid-fma-max-bits=512".  */
+
+data_e
+foo1 (data_e a, data_e b, data_e c, data_e d)
+{
+  data_e result = 0;
+
+  for (int ic = 0; ic < LOOP_COUNT; ic++)
+    {
+      result += (a * b + c * d);
+
+      a -= 0.1;
+      b += 0.9;
+      c *= 1.02;
+      d *= 0.61;
+    }
+
+  return result;
+}
+
+data_e
+foo2 (data_e a, data_e b, data_e c, data_e d)
+{
+  data_e result = 0;
+
+  for (int ic = 0; ic < LOOP_COUNT; ic++)
+    {
+      result = a * b + result + c * d;
+
+      a -= 0.1;
+      b += 0.9;
+      c *= 1.02;
+      d *= 0.61;
+    }
+
+  return result;
+}
+
+data_e
+foo3 (data_e a, data_e b, data_e c, data_e d)
+{
+  data_e result = 0;
+
+  for (int ic = 0; ic < LOOP_COUNT; ic++)
+    {
+      result = result + a * b + c * d;
+
+      a -= 0.1;
+      b += 0.9;
+      c *= 1.02;
+      d *= 0.61;
+    }
+
+  return result;
+}
+
+/* { dg-final { scan-tree-dump-times "Generated FMA" 3 "widening_mul"} } */
\ No newline at end of file
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index 26321aa4fc5..07fc8e2f1f9 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -5431,16 +5431,19 @@ get_required_cycles (int ops_num, int cpu_width)
 }
 
 /* Returns an optimal number of registers to use for computation of
-   given statements.  */
+   given statements.
+
+   LHS is the result ssa name of OPS.  */
 
 static int
-get_reassociation_width (int ops_num, enum tree_code opc,
-			 machine_mode mode)
+get_reassociation_width (vec<operand_entry *> *ops, tree lhs,
+			 enum tree_code opc, machine_mode mode)
 {
   int param_width = param_tree_reassoc_width;
   int width;
   int width_min;
   int cycles_best;
+  int ops_num = ops->length ();
 
   if (param_width > 0)
     width = param_width;
@@ -5471,6 +5474,61 @@ get_reassociation_width (int ops_num, enum tree_code opc,
 	break;
     }
 
+  /* If there's a loop dependent FMA result, return width=2 to avoid it.  This
+     is better than skipping these FMA candidates in widening_mul.  */
+  if (width == 1
+      && maybe_le (tree_to_poly_int64 (TYPE_SIZE (TREE_TYPE (lhs))),
+		   param_avoid_fma_max_bits))
+    {
+      /* Look for cross backedge dependency:
+	1. LHS is a phi argument in the same basic block where it is defined.
+	2. The result of the phi node is used in OPS.  */
+      basic_block bb = gimple_bb (SSA_NAME_DEF_STMT (lhs));
+
+      use_operand_p use_p;
+      imm_use_iterator iter;
+      FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
+	if (gphi *phi = dyn_cast<gphi *> (USE_STMT (use_p)))
+	  {
+	    if (gimple_phi_arg_edge (phi, phi_arg_index_from_use (use_p))->src
+		!= bb)
+	      continue;
+	    tree phi_result = gimple_phi_result (phi);
+	    operand_entry *oe;
+	    unsigned int j;
+	    FOR_EACH_VEC_ELT (*ops, j, oe)
+	      {
+		if (TREE_CODE (oe->op) != SSA_NAME)
+		  continue;
+
+		/* Result of phi is operand of PLUS_EXPR.  */
+		if (oe->op == phi_result)
+		  return 2;
+
+		/* Check if result of phi is operand of MULT_EXPR.  */
+		gimple *def_stmt = SSA_NAME_DEF_STMT (oe->op);
+		if (is_gimple_assign (def_stmt)
+		    && gimple_assign_rhs_code (def_stmt) == NEGATE_EXPR)
+		  {
+		    tree rhs = gimple_assign_rhs1 (def_stmt);
+		    if (TREE_CODE (rhs) == SSA_NAME)
+		      {
+			if (rhs == phi_result)
+			  return 2;
+			def_stmt = SSA_NAME_DEF_STMT (rhs);
+		      }
+		  }
+		if (is_gimple_assign (def_stmt)
+		    && gimple_assign_rhs_code (def_stmt) == MULT_EXPR)
+		  {
+		    if (gimple_assign_rhs1 (def_stmt) == phi_result
+			|| gimple_assign_rhs2 (def_stmt) == phi_result)
+		      return 2;
+		  }
+	      }
+	  }
+    }
+
   return width;
 }
 
@@ -7031,8 +7089,9 @@ reassociate_bb (basic_block bb)
 		     with initial linearization.  */
 		  if (!reassoc_insert_powi_p
 		      && ops.length () > 3
-		      && (width = get_reassociation_width (ops_num, rhs_code,
-							   mode)) > 1)
+		      && (width
+			  = get_reassociation_width (&ops, lhs, rhs_code, mode))
+			   > 1)
 		    {
 		      if (dump_file && (dump_flags & TDF_DETAILS))
 			fprintf (dump_file,
@@ -7049,7 +7108,13 @@ reassociate_bb (basic_block bb)
 			 to make sure the ones that get the double
 			 binary op are chosen wisely.  */
 		      int len = ops.length ();
-		      if (len >= 3 && !has_fma)
+		      if (len >= 3
+			  && (!has_fma
+			      /* width > 1 means ranking ops results in better
+				 parallelism.  */
+			      || get_reassociation_width (&ops, lhs, rhs_code,
+							  mode)
+				   > 1))
 			swap_ops_for_binary_stmt (ops, len - 3);
 
 		      new_lhs = rewrite_expr_tree (stmt, rhs_code, 0, ops,

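As a usage sketch (flags taken from the test's dg-options; the dump
file name follows GCC's usual <src>.<NNN>t.<passname> pattern), the
new test's expectation can be checked by hand:

    $ gcc -S -Ofast --param avoid-fma-max-bits=512 \
          --param tree-reassoc-width=4 \
          -fdump-tree-widening_mul-details pr110279-1.c
    $ grep -c "Generated FMA" pr110279-1.c.*.widening_mul
    3

The count of 3 (one FMA per function) matches the test's
scan-tree-dump-times directive.  The test adds -march=armv8.2-a on
aarch64; a target with FMA instructions is needed for widening_mul
to generate FMAs at all.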