public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH v2] tree-optimization/110279- Check for nested FMA chains in reassoc
@ 2023-07-11  2:59 Di Zhao OS
  2023-07-11  6:51 ` Philipp Tomsich
  2023-07-19  9:05 ` Richard Biener
  0 siblings, 2 replies; 5+ messages in thread
From: Di Zhao OS @ 2023-07-11  2:59 UTC (permalink / raw)
  To: gcc-patches; +Cc: Philipp Tomsich

[-- Attachment #1: Type: text/plain, Size: 1396 bytes --]

Attached is an updated version of the patch.

Based on Philipp's review, some changes:

1. Defined new enum fma_state to describe the state of FMA candidates
   for a list of operands. (Since the tests seems simple after the
   change, I didn't add predicates on it.)
2. Changed return type of convert_mult_to_fma_1 and convert_mult_to_fma
   to tree, to remove the in/out parameter.
3. Added description of return value values of rank_ops_for_fma.

---
gcc/ChangeLog:

        * tree-ssa-math-opts.cc (convert_mult_to_fma_1): Added new parameter
        check_only_p. Changed return type to tree.
        (struct fma_transformation_info): Moved to header.
        (class fma_deferring_state): Moved to header.
        (convert_mult_to_fma): Added new parameter check_only_p. Changed
        return type to tree.
        * tree-ssa-math-opts.h (struct fma_transformation_info): Moved from .cc.
        (class fma_deferring_state): Moved from .cc.
        (convert_mult_to_fma): Add function decl.
        * tree-ssa-reassoc.cc (enum fma_state): Defined new enum to describe
        the state of FMA candidates for a list of operands.
        (rewrite_expr_tree_parallel): Changed boolean parameter to enum type.
        (rank_ops_for_fma): Return enum fma_state.
        (reassociate_bb): Avoid rewriting to parallel if nested FMAs are found.

Thanks,
Di Zhao 



[-- Attachment #2: 0001-Check-for-nested-FMA-chains-in-reassoc.patch --]
[-- Type: application/octet-stream, Size: 20766 bytes --]

From 57f4c183937ca71b12c165247aa43be669713d5f Mon Sep 17 00:00:00 2001
From: "Di Zhao OS" <dizhao@os.amperecomputing.com>
Date: Mon, 10 Jul 2023 17:08:51 +0800
Subject: [PATCH] Check for nested FMA chains in reassoc

1. Reused code in pass widening_mul to check for nested FMA chains
   (those connected by MULT_EXPRs), since re-writing to parallel
   generates worse codes.

2. Avoid re-arrange to produce less FMA chains that can be slow.

With option "-funroll-loops -Ofast -flto", the improvements of
spec2017 fprate 1-copy run are:

Ampere1:
    508.namd_r          4.26%
    510.parest_r        2.55%
    Overall             0.54%
Intel Xeon:
    503.bwaves_r        1.3%
    508.namd_r          1.58%
    overall             0.42%
---
 gcc/tree-ssa-math-opts.cc | 131 +++++++++++++++-----------------------
 gcc/tree-ssa-math-opts.h  |  53 +++++++++++++++
 gcc/tree-ssa-reassoc.cc   | 101 ++++++++++++++++++++++++++---
 3 files changed, 195 insertions(+), 90 deletions(-)

diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 68fc518b1ab..364f2f13cfb 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -3064,15 +3064,20 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
 
 /* Given a result MUL_RESULT which is a result of a multiplication of OP1 and
    OP2 and which we know is used in statements that can be, together with the
-   multiplication, converted to FMAs, perform the transformation.  */
+   multiplication, converted to FMAs, perform the transformation.  Returns LHS
+   of a converted FMA.
 
-static void
-convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
+   If CHECK_ONLY_P is true, skip the actual transformation.  */
+
+static tree
+convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2,
+		       bool check_only_p = false)
 {
   tree type = TREE_TYPE (mul_result);
   gimple *use_stmt;
   imm_use_iterator imm_iter;
   gcall *fma_stmt;
+  tree lhs = NULL_TREE;
 
   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
     {
@@ -3091,8 +3096,11 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
 	  use_operand_p use_p;
 	  gimple *neguse_stmt;
 	  single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
-	  gsi_remove (&gsi, true);
-	  release_defs (use_stmt);
+	  if (!check_only_p)
+	    {
+	      gsi_remove (&gsi, true);
+	      release_defs (use_stmt);
+	    }
 
 	  use_stmt = neguse_stmt;
 	  gsi = gsi_for_stmt (use_stmt);
@@ -3106,6 +3114,10 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
 	gcc_unreachable ();
       addop = ops[0] == result ? ops[1] : ops[0];
 
+      lhs = gimple_get_lhs (use_stmt);
+      if (check_only_p)
+	return lhs;
+
       if (code == MINUS_EXPR)
 	{
 	  if (ops[0] == result)
@@ -3127,7 +3139,7 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
 					       op2, addop, else_value);
       else
 	fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop);
-      gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt));
+      gimple_set_lhs (fma_stmt, lhs);
       gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (cfun,
 								   use_stmt));
       gsi_replace (&gsi, fma_stmt, true);
@@ -3179,55 +3191,9 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
 
       widen_mul_stats.fmas_inserted++;
     }
-}
-
-/* Data necessary to perform the actual transformation from a multiplication
-   and an addition to an FMA after decision is taken it should be done and to
-   then delete the multiplication statement from the function IL.  */
-
-struct fma_transformation_info
-{
-  gimple *mul_stmt;
-  tree mul_result;
-  tree op1;
-  tree op2;
-};
-
-/* Structure containing the current state of FMA deferring, i.e. whether we are
-   deferring, whether to continue deferring, and all data necessary to come
-   back and perform all deferred transformations.  */
-
-class fma_deferring_state
-{
-public:
-  /* Class constructor.  Pass true as PERFORM_DEFERRING in order to actually
-     do any deferring.  */
 
-  fma_deferring_state (bool perform_deferring)
-    : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
-      m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
-
-  /* List of FMA candidates for which we the transformation has been determined
-     possible but we at this point in BB analysis we do not consider them
-     beneficial.  */
-  auto_vec<fma_transformation_info, 8> m_candidates;
-
-  /* Set of results of multiplication that are part of an already deferred FMA
-     candidates.  */
-  hash_set<tree> m_mul_result_set;
-
-  /* The PHI that supposedly feeds back result of a FMA to another over loop
-     boundary.  */
-  gphi *m_initial_phi;
-
-  /* Result of the last produced FMA candidate or NULL if there has not been
-     one.  */
-  tree m_last_result;
-
-  /* If true, deferring might still be profitable.  If false, transform all
-     candidates and no longer defer.  */
-  bool m_deferring_p;
-};
+  return lhs;
+}
 
 /* Transform all deferred FMA candidates and mark STATE as no longer
    deferring.  */
@@ -3288,7 +3254,9 @@ last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
 
 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
    with uses in additions and subtractions to form fused multiply-add
-   operations.  Returns true if successful and MUL_STMT should be removed.
+   operations.  Returns the LHS of FMA if successful and MUL_STMT should be
+   removed, otherwise returns NULL_TREE.
+
    If MUL_COND is nonnull, the multiplication in MUL_STMT is conditional
    on MUL_COND, otherwise it is unconditional.
 
@@ -3303,17 +3271,21 @@ last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
   or its unrolled version, i.e. with several FMA candidates that feed result
   of one into the addend of another.  Instead, we add them to a list in STATE
   and if we later discover an FMA candidate that is not part of such a chain,
-  we go back and perform all deferred past candidates.  */
+  we go back and perform all deferred past candidates.
 
-static bool
+  If CHECK_ONLY_P is true, skip the actual transformation and just return the
+  result.  */
+
+tree
 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
-		     fma_deferring_state *state, tree mul_cond = NULL_TREE)
+		     fma_deferring_state *state, tree mul_cond,
+		     bool check_only_p)
 {
   tree mul_result = gimple_get_lhs (mul_stmt);
   /* If there isn't a LHS then this can't be an FMA.  There can be no LHS
      if the statement was left just for the side-effects.  */
   if (!mul_result)
-    return false;
+    return NULL_TREE;
   tree type = TREE_TYPE (mul_result);
   gimple *use_stmt, *neguse_stmt;
   use_operand_p use_p;
@@ -3321,24 +3293,24 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
 
   if (FLOAT_TYPE_P (type)
       && flag_fp_contract_mode != FP_CONTRACT_FAST)
-    return false;
+    return NULL_TREE;
 
   /* We don't want to do bitfield reduction ops.  */
   if (INTEGRAL_TYPE_P (type)
       && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
-    return false;
+    return NULL_TREE;
 
   /* If the target doesn't support it, don't generate it.  We assume that
      if fma isn't available then fms, fnma or fnms are not either.  */
   optimization_type opt_type = bb_optimization_type (gimple_bb (mul_stmt));
   if (!direct_internal_fn_supported_p (IFN_FMA, type, opt_type))
-    return false;
+    return NULL_TREE;
 
   /* If the multiplication has zero uses, it is kept around probably because
      of -fnon-call-exceptions.  Don't optimize it away in that case,
      it is DCE job.  */
   if (has_zero_uses (mul_result))
-    return false;
+    return NULL_TREE;
 
   bool check_defer
     = (state->m_deferring_p
@@ -3358,7 +3330,7 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
      because the result is not a natural FMA chain.  */
   if (ANY_INTEGRAL_TYPE_P (type)
       && !has_single_use (mul_result))
-    return false;
+    return NULL_TREE;
 
   /* Make sure that the multiplication statement becomes dead after
      the transformation, thus that all uses are transformed to FMAs.
@@ -3384,7 +3356,7 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
 	 to form a fma in the then block and sink the multiplication to the
 	 else block.  */
       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
-	return false;
+	return NULL_TREE;
 
       /* A negate on the multiplication leads to FNMA.  */
       if (is_gimple_assign (use_stmt)
@@ -3397,7 +3369,7 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
 	     negates of the same value, treat them as equivalent
 	     to a single negate with multiple uses.  */
 	  if (seen_negate_p)
-	    return false;
+	    return NULL_TREE;
 
 	  result = gimple_assign_lhs (use_stmt);
 
@@ -3405,17 +3377,17 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
 	     single transformation.  */
 	  if (!single_imm_use (gimple_assign_lhs (use_stmt),
 			       &use_p, &neguse_stmt))
-	    return false;
+	    return NULL_TREE;
 
 	  /* Make sure the multiplication isn't also used on that stmt.  */
 	  FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
 	    if (USE_FROM_PTR (usep) == mul_result)
-	      return false;
+	      return NULL_TREE;
 
 	  /* Re-validate.  */
 	  use_stmt = neguse_stmt;
 	  if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
-	    return false;
+	    return NULL_TREE;
 
 	  negate_p = seen_negate_p = true;
 	}
@@ -3424,7 +3396,7 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
       tree_code code;
       if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code, ops,
 					      &else_value))
-	return false;
+	return NULL_TREE;
 
       switch (code)
 	{
@@ -3436,18 +3408,18 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
 	  break;
 	default:
 	  /* FMA can only be formed from PLUS and MINUS.  */
-	  return false;
+	  return NULL_TREE;
 	}
 
       if (mul_cond && cond != mul_cond)
-	return false;
+	return NULL_TREE;
 
       if (cond)
 	{
 	  if (cond == result || else_value == result)
-	    return false;
+	    return NULL_TREE;
 	  if (!direct_internal_fn_supported_p (IFN_COND_FMA, type, opt_type))
-	    return false;
+	    return NULL_TREE;
 	}
 
       /* If the subtrahend (OPS[1]) is computed by a MULT_EXPR that
@@ -3466,18 +3438,18 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
 	  gimple *stmt2 = SSA_NAME_DEF_STMT (ops[1]);
 	  if (is_gimple_assign (stmt2)
 	      && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
-	    return false;
+	    return NULL_TREE;
 	}
 
       /* We can't handle a * b + a * b.  */
       if (ops[0] == ops[1])
-	return false;
+	return NULL_TREE;
       /* If deferring, make sure we are not looking at an instruction that
 	 wouldn't have existed if we were not.  */
       if (state->m_deferring_p
 	  && (state->m_mul_result_set.contains (ops[0])
 	      || state->m_mul_result_set.contains (ops[1])))
-	return false;
+	return NULL_TREE;
 
       if (check_defer)
 	{
@@ -3544,14 +3516,13 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
 	  fprintf (dump_file, "\n");
 	}
 
-      return false;
+      return NULL_TREE;
     }
   else
     {
       if (state->m_deferring_p)
 	cancel_fma_deferring (state);
-      convert_mult_to_fma_1 (mul_result, op1, op2);
-      return true;
+      return convert_mult_to_fma_1 (mul_result, op1, op2, check_only_p);
     }
 }
 
diff --git a/gcc/tree-ssa-math-opts.h b/gcc/tree-ssa-math-opts.h
index 52b7938b599..a4f8ec57da6 100644
--- a/gcc/tree-ssa-math-opts.h
+++ b/gcc/tree-ssa-math-opts.h
@@ -23,4 +23,57 @@ along with GCC; see the file COPYING3.  If not see
 extern tree powi_as_mults (gimple_stmt_iterator *, location_t,
 			   tree, HOST_WIDE_INT);
 
+/* Data necessary to perform the actual transformation from a multiplication
+   and an addition to an FMA after decision is taken it should be done and to
+   then delete the multiplication statement from the function IL.  */
+
+struct fma_transformation_info
+{
+  gimple *mul_stmt;
+  tree mul_result;
+  tree op1;
+  tree op2;
+};
+
+/* Structure containing the current state of FMA deferring, i.e. whether we are
+   deferring, whether to continue deferring, and all data necessary to come
+   back and perform all deferred transformations.  */
+
+class fma_deferring_state
+{
+public:
+  /* Class constructor.  Pass true as PERFORM_DEFERRING in order to actually
+     do any deferring.  */
+
+  fma_deferring_state (bool perform_deferring)
+    : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
+      m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
+
+  /* List of FMA candidates for which we the transformation has been determined
+     possible but we at this point in BB analysis we do not consider them
+     beneficial.  */
+  auto_vec<fma_transformation_info, 8> m_candidates;
+
+  /* Set of results of multiplication that are part of an already deferred FMA
+     candidates.  */
+  hash_set<tree> m_mul_result_set;
+
+  /* The PHI that supposedly feeds back result of a FMA to another over loop
+     boundary.  */
+  gphi *m_initial_phi;
+
+  /* Result of the last produced FMA candidate or NULL if there has not been
+     one.  */
+  tree m_last_result;
+
+  /* If true, deferring might still be profitable.  If false, transform all
+     candidates and no longer defer.  */
+  bool m_deferring_p;
+};
+
+tree
+convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
+		     fma_deferring_state *state, tree mul_cond = NULL_TREE,
+		     bool check_only_p = false);
+
 #endif  /* GCC_TREE_SSA_MATH_OPTS_H  */
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index eda03bf98a6..9d53472e390 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -5471,6 +5471,22 @@ get_reassociation_width (int ops_num, enum tree_code opc,
   return width;
 }
 
+/* This enum describes the state of FMA candidates for a list of operands.  */
+
+enum fma_state {
+  /* There is at most one FMA candidate in the op list.  In this case, no
+     special treatment is needed when rewriting expression tree.  */
+  FMA_STATE_NONE_OR_SINGLE = 0,
+
+  /* The result of a FMA is the first or second operand of another FMA.  In
+     this case, rewriting expression tree should be avoided.  */
+  FMA_STATE_NESTED,
+
+  /* There are multiple FMA candidates in the op list.  Avoid swapping operands
+     in this case.  */
+  FMA_STATE_MULTIPLE
+};
+
 #define SPECIAL_BIASED_END_STMT 0 /* It is the end stmt of all ops.  */
 #define BIASED_END_STMT 1 /* It is the end stmt of normal or biased ops.  */
 #define NORMAL_END_STMT 2 /* It is the end stmt of normal ops.  */
@@ -5507,7 +5523,7 @@ get_reassociation_width (int ops_num, enum tree_code opc,
    SSA6 = SSA4 + SSA5;
  */
 static void
-rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma,
+rewrite_expr_tree_parallel (gassign *stmt, int width, enum fma_state fs,
 			    const vec<operand_entry *> &ops)
 {
   enum tree_code opcode = gimple_assign_rhs_code (stmt);
@@ -5582,7 +5598,7 @@ rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma,
 	}
 
       /* Swap the operands if no FMA in the chain.  */
-      if (ops1->length () > 2 && !has_fma)
+      if (ops1->length () > 2 && fs == FMA_STATE_NONE_OR_SINGLE)
 	swap_ops_for_binary_stmt (*ops1, ops1->length () - 3);
 
       if (i < width_active
@@ -6767,6 +6783,12 @@ transform_stmt_to_multiply (gimple_stmt_iterator *gsi, gimple *stmt,
 }
 
 /* Rearrange ops may have more FMA when the chain may has more than 2 FMAs.
+   Returns the state of FMA candidates in OPS:
+   - FMA_STATE_NONE_OR_SINGLE if there's at most one FMA candidate in OPS,
+   - FMA_STATE_NESTED if a FMA's result is the first or second operand of
+     another FMA,
+   - FMA_STATE_MULTIPLE if there are multiple FMA candidates in the op list.
+
    Put no-mult ops and mult ops alternately at the end of the queue, which is
    conducive to generating more FMA and reducing the loss of FMA when breaking
    the chain.
@@ -6781,8 +6803,8 @@ transform_stmt_to_multiply (gimple_stmt_iterator *gsi, gimple *stmt,
 
    _4  = .FMA (c_7(D), d_8(D), _3);
    _11 = .FMA (a_5(D), b_6(D), _4);  */
-static bool
-rank_ops_for_fma (vec<operand_entry *> *ops)
+static enum fma_state
+rank_ops_for_fma (vec<operand_entry *> *ops, tree type)
 {
   operand_entry *oe;
   unsigned int i;
@@ -6804,6 +6826,59 @@ rank_ops_for_fma (vec<operand_entry *> *ops)
       else
 	ops_others.safe_push (oe);
     }
+
+  /* Check the lhs of MULT_EXPRs, if there's a FMA chain with pattern:
+
+	d = .FMA (a, b, c);
+	g = .FMA (d, e, f);
+	...
+
+     then return FMA_STATE_NESTED to avoid rewrite_expr_tree_parallel.  */
+  fma_deferring_state fds (false);
+  FOR_EACH_VEC_ELT (ops_mult, i, oe)
+    {
+      gimple *mul_stmt = SSA_NAME_DEF_STMT (oe->op);
+      tree lhs, lhs2;
+      if ((lhs = convert_mult_to_fma (mul_stmt, gimple_assign_rhs1 (mul_stmt),
+				      gimple_assign_rhs2 (mul_stmt), &fds,
+				      NULL, true)))
+	{
+	  /* Check for MULT_EXPR that uses LHS.  */
+	  imm_use_iterator imm_iter;
+	  gimple *use_stmt;
+	  use_operand_p use_p;
+	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
+	    {
+	      use_stmt = USE_STMT (use_p);
+	      if (is_gimple_debug (use_stmt))
+		continue;
+	      if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
+		continue;
+
+	      if (is_gimple_assign (use_stmt)
+		  && gimple_assign_rhs_code (use_stmt) == MULT_EXPR
+		  && (lhs2 = convert_mult_to_fma (use_stmt,
+						  gimple_assign_rhs1 (use_stmt),
+						  gimple_assign_rhs2 (use_stmt),
+						  &fds, NULL, true))
+		  && gimple_bb (SSA_NAME_DEF_STMT (lhs2))
+		       == gimple_bb (mul_stmt))
+		{
+		  if (dump_file && (dump_flags & TDF_DETAILS))
+		    {
+		      fprintf (dump_file,
+			       "Found possible nested FMA chain. LHS1: ");
+		      print_generic_expr (dump_file, lhs);
+		      fprintf (dump_file, ", LHS2: ");
+		      print_generic_expr (dump_file, lhs2);
+		      fprintf (dump_file, "\n");
+		    }
+		  return FMA_STATE_NESTED;
+		}
+	    }
+	}
+    }
+
   /* 1. When ops_mult.length == 2, like the following case,
 
      a * b + c * d + e.
@@ -6815,6 +6890,11 @@ rank_ops_for_fma (vec<operand_entry *> *ops)
      2. If all ops are defined with mult, we don't need to rearrange them.  */
   if (ops_mult.length () >= 2 && ops_mult.length () != ops_length)
     {
+      if (maybe_le (tree_to_poly_int64 (TYPE_SIZE (type)),
+		    param_avoid_fma_max_bits))
+	/* Avoid re-arrange to produce less FMA chains that can be slow.  */
+	return FMA_STATE_MULTIPLE;
+
       /* Put no-mult ops and mult ops alternately at the end of the
 	 queue, which is conducive to generating more FMA and reducing the
 	 loss of FMA when breaking the chain.  */
@@ -6829,9 +6909,9 @@ rank_ops_for_fma (vec<operand_entry *> *ops)
 	  if (opindex > 0)
 	    opindex--;
 	}
-      return true;
+      return FMA_STATE_MULTIPLE;
     }
-  return false;
+  return FMA_STATE_NONE_OR_SINGLE;
 }
 /* Reassociate expressions in basic block BB and its post-dominator as
    children.
@@ -6997,7 +7077,7 @@ reassociate_bb (basic_block bb)
 		  machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
 		  int ops_num = ops.length ();
 		  int width;
-		  bool has_fma = false;
+		  enum fma_state fs = FMA_STATE_NONE_OR_SINGLE;
 
 		  /* For binary bit operations, if there are at least 3
 		     operands and the last operand in OPS is a constant,
@@ -7020,7 +7100,7 @@ reassociate_bb (basic_block bb)
 						      opt_type)
 		      && (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR))
 		    {
-		      has_fma = rank_ops_for_fma (&ops);
+		      fs = rank_ops_for_fma (&ops, TREE_TYPE (lhs));
 		    }
 
 		  /* Only rewrite the expression tree to parallel in the
@@ -7028,6 +7108,7 @@ reassociate_bb (basic_block bb)
 		     with initial linearization.  */
 		  if (!reassoc_insert_powi_p
 		      && ops.length () > 3
+		      && fs != FMA_STATE_NESTED
 		      && (width = get_reassociation_width (ops_num, rhs_code,
 							   mode)) > 1)
 		    {
@@ -7037,7 +7118,7 @@ reassociate_bb (basic_block bb)
 				 width);
 		      rewrite_expr_tree_parallel (as_a <gassign *> (stmt),
 						  width,
-						  has_fma,
+						  fs,
 						  ops);
 		    }
 		  else
@@ -7046,7 +7127,7 @@ reassociate_bb (basic_block bb)
 			 to make sure the ones that get the double
 			 binary op are chosen wisely.  */
 		      int len = ops.length ();
-		      if (len >= 3 && !has_fma)
+		      if (len >= 3 && fs == FMA_STATE_NONE_OR_SINGLE)
 			swap_ops_for_binary_stmt (ops, len - 3);
 
 		      new_lhs = rewrite_expr_tree (stmt, rhs_code, 0, ops,
-- 
2.25.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-07-19  9:05 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-11  2:59 [PATCH v2] tree-optimization/110279- Check for nested FMA chains in reassoc Di Zhao OS
2023-07-11  6:51 ` Philipp Tomsich
2023-07-17 14:25   ` Tamar Christina
2023-07-18 11:41     ` Richard Biener
2023-07-19  9:05 ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).