public inbox for gcc-patches@gcc.gnu.org
* [PATCH]AArch64 update costing for MLA by invariant
@ 2023-08-02 10:21 Tamar Christina
  2023-08-02 10:43 ` Richard Sandiford
  0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2023-08-02 10:21 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 5589 bytes --]

Hi All,

When determining issue rates we currently discount MLAs with non-constant
accumulators for Advanced SIMD, but we don't apply a similar discount when
computing latency.

This means the costs for Advanced SIMD with a constant accumulator are wrong
and result in us costing SVE and Advanced SIMD the same.  This can cause us to
vectorize with Advanced SIMD instead of SVE in some cases.
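
As a made-up illustration of the kind of statement involved (this is not the
fotonik3d kernel), consider a loop whose accumulator input is loop-invariant
(a constant addend behaves similarly):

  /* Illustrative only: the addend C is invariant across the loop.  */
  void
  f (float *restrict out, const float *restrict a,
     const float *restrict b, float c, int n)
  {
    for (int i = 0; i < n; i++)
      out[i] = c + a[i] * b[i];
  }

SVE can tie the result of the fused multiply-add to either multiplicand (FMAD,
or FMLA with a MOVPRFX), so it stays one operation per vector.  Advanced SIMD
FMLA is destructive on its accumulator, so an invariant accumulator needs an
extra register move each iteration, which is why costing the two variants the
same is misleading.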

This patch adds the same discount for SVE and scalar code as we do for the
issue rate.

My assumption was that for the issue rate we reject all scalar constants early
because we take into account the extra instruction needed to create the
constant, though I'd have expected that to be part of the prologue costs.  For
this reason I added an extra parameter that lets me force the check to at
least look for the multiplication.

This gives a 5% improvement in fotonik3d_r from SPEC CPU 2017 on large
Neoverse cores.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_multiply_add_p): Add param
	allow_constants. 
	(aarch64_adjust_stmt_cost): Use it.
	(aarch64_vector_costs::count_ops): Likewise.
	(aarch64_vector_costs::add_stmt_cost): Pass vinfo to
	aarch64_adjust_stmt_cost.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c972557e2f83b63ba365fea9 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
    or multiply-subtract sequence that might be suitable for fusing into a
    single instruction.  If VEC_FLAGS is zero, analyze the operation as
    a scalar one, otherwise analyze it as an operation on vectors with those
-   VEC_* flags.  */
+   VEC_* flags.  If ALLOW_CONSTANTS is true, recognize all accumulators,
+   including constant ones.  */
 static bool
 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
-			unsigned int vec_flags)
+			unsigned int vec_flags, bool allow_constants)
 {
   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
   if (!assign)
@@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
   if (code != PLUS_EXPR && code != MINUS_EXPR)
     return false;
 
-  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
-      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
+  if (!allow_constants
+      && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
+	  || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))))
     return false;
 
   for (int i = 1; i < 3; ++i)
@@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
       if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
 	continue;
 
-      if (vec_flags & VEC_ADVSIMD)
+      if (!allow_constants && (vec_flags & VEC_ADVSIMD))
 	{
 	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
 	     although that requires a MOVPRFX for SVE).  However, Advanced SIMD
@@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
 	    return false;
 	  def_stmt_info = vinfo->lookup_def (rhs);
 	  if (!def_stmt_info
-	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
+	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
+	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
 	    return false;
 	}
 
@@ -16721,8 +16724,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
    and which when vectorized would operate on vector type VECTYPE.  Add the
    cost of any embedded operations.  */
 static fractional_cost
-aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
-			  tree vectype, fractional_cost stmt_cost)
+aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
+			  stmt_vec_info stmt_info, tree vectype,
+			  unsigned vec_flags, fractional_cost stmt_cost)
 {
   if (vectype)
     {
@@ -16745,6 +16749,15 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
 	  break;
 	}
 
+      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
+      if (assign && !vect_is_reduction (stmt_info))
+	{
+	  bool simd_p = vec_flags & VEC_ADVSIMD;
+	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
+	  if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
+	    return 0;
+	}
+
       if (kind == vector_stmt || kind == vec_to_scalar)
 	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
 	  {
@@ -16795,7 +16808,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
     }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
+  if (stmt_info
+      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false))
     return;
 
   /* Count the basic operation cost associated with KIND.  */
@@ -17041,8 +17055,8 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
     {
       /* Account for any extra "embedded" costs that apply additively
 	 to the base cost calculated above.  */
-      stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
-					    stmt_cost);
+      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
+					    vectype, m_vec_flags, stmt_cost);
 
       /* If we're recording a nonzero vector loop body cost for the
 	 innermost loop, also estimate the operations that would need




-- 


end of thread, other threads:[~2023-08-03 13:39 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
2023-08-02 10:21 [PATCH]AArch64 update costing for MLA by invariant Tamar Christina
2023-08-02 10:43 ` Richard Sandiford
2023-08-02 10:52   ` Tamar Christina
2023-08-02 10:56     ` Richard Sandiford
2023-08-03 12:56       ` Tamar Christina
2023-08-03 13:38         ` Richard Sandiford
