public inbox for gcc-patches@gcc.gnu.org
* [PATCH]AArch64 update costing for MLA by invariant
@ 2023-08-02 10:21 Tamar Christina
  2023-08-02 10:43 ` Richard Sandiford
  0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2023-08-02 10:21 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 5589 bytes --]

Hi All,

When determining issue rates we currently discount non-constant MLA accumulators
for Advanced SIMD, but we don't do the same for the latency.

This means the costs for Advanced SIMD with a constant accumulator are wrong
and result in us costing SVE and Advanced SIMD the same.  This can cause us to
vectorize with Advanced SIMD instead of SVE in some cases.

This patch adds the same discount for SVE and Scalar as we do for issue rate.

My assumption was that for the issue rate we reject all scalar constants early
because we take into account the extra instruction needed to create the
constant?  Though I'd have expected this to show up in the prologue costs.
For this reason I added an extra parameter that lets me force the check to at
least look for the multiplication.

This gives a 5% improvement in fotonik3d_r in SPECCPU 2017 on large
Neoverse cores.
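
To make this concrete, here is a minimal, hypothetical example of the kind of
loop this is about (the function and its names are made up for illustration):

  void
  f (double *__restrict out, double *__restrict a, double *__restrict b,
     double c, int n)
  {
    for (int i = 0; i < n; i++)
      /* Multiply-add whose accumulator input is the loop-invariant C;
	 a literal constant accumulator is the analogous constant case.  */
      out[i] = a[i] * b[i] + c;
  }

Advanced SIMD only has accumulating MLA/FMLA forms, so the result has to be
tied to the accumulator input and an invariant accumulator therefore needs an
extra move, whereas scalar and SVE code can tie the result to any input (or
none, via MOVPRFX for SVE).  The patch lets the latency costing reflect this
in the same way the issue-rate costing already does.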

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_multiply_add_p): Add param
	allow_constants. 
	(aarch64_adjust_stmt_cost): Use it.
	(aarch64_vector_costs::count_ops): Likewise.
	(aarch64_vector_costs::add_stmt_cost): Pass vinfo to
	aarch64_adjust_stmt_cost.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c972557e2f83b63ba365fea9 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
    or multiply-subtract sequence that might be suitable for fusing into a
    single instruction.  If VEC_FLAGS is zero, analyze the operation as
    a scalar one, otherwise analyze it as an operation on vectors with those
-   VEC_* flags.  */
+   VEC_* flags.  If ALLOW_CONSTANTS is true, recognize all accumulators,
+   including constant ones.  */
 static bool
 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
-			unsigned int vec_flags)
+			unsigned int vec_flags, bool allow_constants)
 {
   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
   if (!assign)
@@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
   if (code != PLUS_EXPR && code != MINUS_EXPR)
     return false;
 
-  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
-      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
+  if (!allow_constants
+      && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
+	  || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))))
     return false;
 
   for (int i = 1; i < 3; ++i)
@@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
       if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
 	continue;
 
-      if (vec_flags & VEC_ADVSIMD)
+      if (!allow_constants && (vec_flags & VEC_ADVSIMD))
 	{
 	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
 	     although that requires a MOVPRFX for SVE).  However, Advanced SIMD
@@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
 	    return false;
 	  def_stmt_info = vinfo->lookup_def (rhs);
 	  if (!def_stmt_info
-	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
+	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
+	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
 	    return false;
 	}
 
@@ -16721,8 +16724,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
    and which when vectorized would operate on vector type VECTYPE.  Add the
    cost of any embedded operations.  */
 static fractional_cost
-aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
-			  tree vectype, fractional_cost stmt_cost)
+aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
+			  stmt_vec_info stmt_info, tree vectype,
+			  unsigned vec_flags, fractional_cost stmt_cost)
 {
   if (vectype)
     {
@@ -16745,6 +16749,15 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
 	  break;
 	}
 
+      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
+      if (assign && !vect_is_reduction (stmt_info))
+	{
+	  bool simd_p = vec_flags & VEC_ADVSIMD;
+	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
+	  if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
+	    return 0;
+	}
+
       if (kind == vector_stmt || kind == vec_to_scalar)
 	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
 	  {
@@ -16795,7 +16808,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
     }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
+  if (stmt_info
+      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false))
     return;
 
   /* Count the basic operation cost associated with KIND.  */
@@ -17041,8 +17055,8 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
     {
       /* Account for any extra "embedded" costs that apply additively
 	 to the base cost calculated above.  */
-      stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
-					    stmt_cost);
+      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
+					    vectype, m_vec_flags, stmt_cost);
 
       /* If we're recording a nonzero vector loop body cost for the
 	 innermost loop, also estimate the operations that would need




-- 

[-- Attachment #2: rb17618.patch --]
[-- Type: text/plain, Size: 4350 bytes --]

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c972557e2f83b63ba365fea9 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
    or multiply-subtract sequence that might be suitable for fusing into a
    single instruction.  If VEC_FLAGS is zero, analyze the operation as
    a scalar one, otherwise analyze it as an operation on vectors with those
-   VEC_* flags.  */
+   VEC_* flags.  If ALLOW_CONSTANTS is true, recognize all accumulators,
+   including constant ones.  */
 static bool
 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
-			unsigned int vec_flags)
+			unsigned int vec_flags, bool allow_constants)
 {
   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
   if (!assign)
@@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
   if (code != PLUS_EXPR && code != MINUS_EXPR)
     return false;
 
-  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
-      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
+  if (!allow_constants
+      && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
+	  || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))))
     return false;
 
   for (int i = 1; i < 3; ++i)
@@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
       if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
 	continue;
 
-      if (vec_flags & VEC_ADVSIMD)
+      if (!allow_constants && (vec_flags & VEC_ADVSIMD))
 	{
 	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
 	     although that requires a MOVPRFX for SVE).  However, Advanced SIMD
@@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
 	    return false;
 	  def_stmt_info = vinfo->lookup_def (rhs);
 	  if (!def_stmt_info
-	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
+	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
+	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
 	    return false;
 	}
 
@@ -16721,8 +16724,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
    and which when vectorized would operate on vector type VECTYPE.  Add the
    cost of any embedded operations.  */
 static fractional_cost
-aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
-			  tree vectype, fractional_cost stmt_cost)
+aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
+			  stmt_vec_info stmt_info, tree vectype,
+			  unsigned vec_flags, fractional_cost stmt_cost)
 {
   if (vectype)
     {
@@ -16745,6 +16749,15 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
 	  break;
 	}
 
+      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
+      if (assign && !vect_is_reduction (stmt_info))
+	{
+	  bool simd_p = vec_flags & VEC_ADVSIMD;
+	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
+	  if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
+	    return 0;
+	}
+
       if (kind == vector_stmt || kind == vec_to_scalar)
 	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
 	  {
@@ -16795,7 +16808,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
     }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
+  if (stmt_info
+      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false))
     return;
 
   /* Count the basic operation cost associated with KIND.  */
@@ -17041,8 +17055,8 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
     {
       /* Account for any extra "embedded" costs that apply additively
 	 to the base cost calculated above.  */
-      stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
-					    stmt_cost);
+      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
+					    vectype, m_vec_flags, stmt_cost);
 
       /* If we're recording a nonzero vector loop body cost for the
 	 innermost loop, also estimate the operations that would need




^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH]AArch64 update costing for MLA by invariant
  2023-08-02 10:21 [PATCH]AArch64 update costing for MLA by invariant Tamar Christina
@ 2023-08-02 10:43 ` Richard Sandiford
  2023-08-02 10:52   ` Tamar Christina
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Sandiford @ 2023-08-02 10:43 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> When determining issue rates we currently discount non-constant MLA accumulators
> for Advanced SIMD but don't do it for the latency.
>
> This means the costs for Advanced SIMD with a constant accumulator are wrong and
> results in us costing SVE and Advanced SIMD the same.  This can cause us to
> vectorize with Advanced SIMD instead of SVE in some cases.
>
> This patch adds the same discount for SVE and Scalar as we do for issue rate.
>
> My assumption was that on issue rate we reject all scalar constants early
> because we take into account the extra instruction to create the constant?
> Though I'd have expected this to be in prologue costs.  For this reason I added
> an extra parameter to allow me to force the check to at least look for the
> multiplication.

I'm not sure that was it.  I wish I'd added a comment to say what
it was though :(  I suspect different parts of this function were
written at different times, hence the inconsistency.

> This gives a 5% improvement in fotonik3d_r in SPECCPU 2017 on large
> Neoverse cores.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (aarch64_multiply_add_p): Add param
> 	allow_constants. 
> 	(aarch64_adjust_stmt_cost): Use it.
> 	(aarch64_vector_costs::count_ops): Likewise.
> 	(aarch64_vector_costs::add_stmt_cost): Pass vinfo to
> 	aarch64_adjust_stmt_cost.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c972557e2f83b63ba365fea9 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
>     or multiply-subtract sequence that might be suitable for fusing into a
>     single instruction.  If VEC_FLAGS is zero, analyze the operation as
>     a scalar one, otherwise analyze it as an operation on vectors with those
> -   VEC_* flags.  */
> +   VEC_* flags.  When ALLOW_CONSTANTS we'll recognize all accumulators including
> +   constant ones.  */
>  static bool
>  aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
> -			unsigned int vec_flags)
> +			unsigned int vec_flags, bool allow_constants)
>  {
>    gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
>    if (!assign)
> @@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
>    if (code != PLUS_EXPR && code != MINUS_EXPR)
>      return false;
>  
> -  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
> -      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
> +  if (!allow_constants
> +      && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
> +	  || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))))
>      return false;
>  
>    for (int i = 1; i < 3; ++i)
> @@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
>        if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
>  	continue;
>  
> -      if (vec_flags & VEC_ADVSIMD)
> +      if (!allow_constants && (vec_flags & VEC_ADVSIMD))
>  	{
>  	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
>  	     although that requires a MOVPRFX for SVE).  However, Advanced SIMD
> @@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
>  	    return false;
>  	  def_stmt_info = vinfo->lookup_def (rhs);
>  	  if (!def_stmt_info
> -	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
> +	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
> +	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)

Do you see vect_constant_defs in practice, or is this just for completeness?
I would expect any constants to appear as direct operands.  I don't mind
keeping it if it's just a belt-and-braces thing though.

But rather than add the allow_constants parameter, I think we should
just try removing:

  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
    return false;

so that the detection is the same for throughput and latency.  I think:

      if (vec_flags & VEC_ADVSIMD)
	{
	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
	     although that requires a MOVPRFX for SVE).  However, Advanced SIMD
	     only supports MLA forms, so will require a move if the result
	     cannot be tied to the accumulator.  The most important case in
	     which this is true is when the accumulator input is invariant.  */
	  rhs = gimple_op (assign, 3 - i);
	  if (TREE_CODE (rhs) != SSA_NAME)
	    return false;
	  def_stmt_info = vinfo->lookup_def (rhs);
	  if (!def_stmt_info
	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
	    return false;
	}

will then do the right thing, because the first return false will
reject constant accumulators for Advanced SIMD.

Thanks,
Richard

> @@ -16721,8 +16724,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
>     and which when vectorized would operate on vector type VECTYPE.  Add the
>     cost of any embedded operations.  */
>  static fractional_cost
> -aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
> -			  tree vectype, fractional_cost stmt_cost)
> +aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
> +			  stmt_vec_info stmt_info, tree vectype,
> +			  unsigned vec_flags, fractional_cost stmt_cost)
>  {
>    if (vectype)
>      {
> @@ -16745,6 +16749,15 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
>  	  break;
>  	}
>  
> +      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
> +      if (assign && !vect_is_reduction (stmt_info))
> +	{
> +	  bool simd_p = vec_flags & VEC_ADVSIMD;
> +	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
> +	  if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
> +	    return 0;
> +	}
> +
>        if (kind == vector_stmt || kind == vec_to_scalar)
>  	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
>  	  {
> @@ -16795,7 +16808,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
>      }
>  
>    /* Assume that multiply-adds will become a single operation.  */
> -  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
> +  if (stmt_info
> +      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false))
>      return;
>  
>    /* Count the basic operation cost associated with KIND.  */
> @@ -17041,8 +17055,8 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>      {
>        /* Account for any extra "embedded" costs that apply additively
>  	 to the base cost calculated above.  */
> -      stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
> -					    stmt_cost);
> +      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
> +					    vectype, m_vec_flags, stmt_cost);
>  
>        /* If we're recording a nonzero vector loop body cost for the
>  	 innermost loop, also estimate the operations that would need

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH]AArch64 update costing for MLA by invariant
  2023-08-02 10:43 ` Richard Sandiford
@ 2023-08-02 10:52   ` Tamar Christina
  2023-08-02 10:56     ` Richard Sandiford
  0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2023-08-02 10:52 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

> Tamar Christina <tamar.christina@arm.com> writes:
> > Hi All,
> >
> > When determining issue rates we currently discount non-constant MLA
> > accumulators for Advanced SIMD but don't do it for the latency.
> >
> > This means the costs for Advanced SIMD with a constant accumulator are
> > wrong and results in us costing SVE and Advanced SIMD the same.  This
> > can cause us to vectorize with Advanced SIMD instead of SVE in some cases.
> >
> > This patch adds the same discount for SVE and Scalar as we do for issue rate.
> >
> > My assumption was that on issue rate we reject all scalar constants
> > early because we take into account the extra instruction to create the
> constant?
> > Though I'd have expected this to be in prologue costs.  For this
> > reason I added an extra parameter to allow me to force the check to at
> > least look for the multiplication.
> 
> I'm not sure that was it.  I wish I'd added a comment to say what it was
> though :(  I suspect different parts of this function were written at different
> times, hence the inconsistency.
> 
> > This gives a 5% improvement in fotonik3d_r in SPECCPU 2017 on large
> > Neoverse cores.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64.cc (aarch64_multiply_add_p): Add param
> > 	allow_constants.
> > 	(aarch64_adjust_stmt_cost): Use it.
> > 	(aarch64_vector_costs::count_ops): Likewise.
> > 	(aarch64_vector_costs::add_stmt_cost): Pass vinfo to
> > 	aarch64_adjust_stmt_cost.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index
> >
> 560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c9725
> 57e2f83
> > b63ba365fea9 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum
> vect_cost_for_stmt kind,
> >     or multiply-subtract sequence that might be suitable for fusing into a
> >     single instruction.  If VEC_FLAGS is zero, analyze the operation as
> >     a scalar one, otherwise analyze it as an operation on vectors with those
> > -   VEC_* flags.  */
> > +   VEC_* flags.  When ALLOW_CONSTANTS we'll recognize all accumulators
> including
> > +   constant ones.  */
> >  static bool
> >  aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
> > -			unsigned int vec_flags)
> > +			unsigned int vec_flags, bool allow_constants)
> >  {
> >    gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
> >    if (!assign)
> > @@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo,
> stmt_vec_info stmt_info,
> >    if (code != PLUS_EXPR && code != MINUS_EXPR)
> >      return false;
> >
> > -  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
> > -      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
> > +  if (!allow_constants
> > +      && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
> > +	  || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))))
> >      return false;
> >
> >    for (int i = 1; i < 3; ++i)
> > @@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo,
> stmt_vec_info stmt_info,
> >        if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
> >  	continue;
> >
> > -      if (vec_flags & VEC_ADVSIMD)
> > +      if (!allow_constants && (vec_flags & VEC_ADVSIMD))
> >  	{
> >  	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
> >  	     although that requires a MOVPRFX for SVE).  However, Advanced
> > SIMD @@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info
> *vinfo, stmt_vec_info stmt_info,
> >  	    return false;
> >  	  def_stmt_info = vinfo->lookup_def (rhs);
> >  	  if (!def_stmt_info
> > -	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
> > +	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
> > +	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
> 
> Do you see vect_constant_defs in practice, or is this just for completeness?
> I would expect any constants to appear as direct operands.  I don't mind
> keeping it if it's just a belt-and-braces thing though.

In the latency case, where I had allow_constants, the early rejection based on
the operand itself wouldn't happen, so I still needed to reject the constants,
but only after the multiply check.  While they do appear as direct operands as
well, they also have their own nodes, in particular for SLP, so the constants
are handled as a group.

But I can also check CONSTANT_CLASS_P (rhs) if that's preferable.

> 
> But rather than add the allow_constants parameter, I think we should just try
> removing:
> 
>   if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
>       || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
>     return false;
> 
> so that the detection is the same for throughput and latency.  I think:
> 
>       if (vec_flags & VEC_ADVSIMD)
> 	{
> 	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
> 	     although that requires a MOVPRFX for SVE).  However, Advanced
> SIMD
> 	     only supports MLA forms, so will require a move if the result
> 	     cannot be tied to the accumulator.  The most important case in
> 	     which this is true is when the accumulator input is invariant.  */
> 	  rhs = gimple_op (assign, 3 - i);
> 	  if (TREE_CODE (rhs) != SSA_NAME)
> 	    return false;
> 	  def_stmt_info = vinfo->lookup_def (rhs);
> 	  if (!def_stmt_info
> 	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
> 	    return false;
> 	}
> 
> will then do the right thing, because the first return false will reject constant
> accumulators for Advanced SIMD.

Makes sense, will do 😊

Thanks,
Tamar

> 
> Thanks,
> Richard
> 
> > @@ -16721,8 +16724,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info
> *vinfo, vect_cost_for_stmt kind,
> >     and which when vectorized would operate on vector type VECTYPE.  Add
> the
> >     cost of any embedded operations.  */  static fractional_cost
> > -aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info
> stmt_info,
> > -			  tree vectype, fractional_cost stmt_cost)
> > +aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
> > +			  stmt_vec_info stmt_info, tree vectype,
> > +			  unsigned vec_flags, fractional_cost stmt_cost)
> >  {
> >    if (vectype)
> >      {
> > @@ -16745,6 +16749,15 @@ aarch64_adjust_stmt_cost
> (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
> >  	  break;
> >  	}
> >
> > +      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT
> (stmt_info));
> > +      if (assign && !vect_is_reduction (stmt_info))
> > +	{
> > +	  bool simd_p = vec_flags & VEC_ADVSIMD;
> > +	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
> > +	  if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
> > +	    return 0;
> > +	}
> > +
> >        if (kind == vector_stmt || kind == vec_to_scalar)
> >  	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
> >  	  {
> > @@ -16795,7 +16808,8 @@ aarch64_vector_costs::count_ops (unsigned
> int count, vect_cost_for_stmt kind,
> >      }
> >
> >    /* Assume that multiply-adds will become a single operation.  */
> > -  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info,
> > m_vec_flags))
> > +  if (stmt_info
> > +      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags,
> > + false))
> >      return;
> >
> >    /* Count the basic operation cost associated with KIND.  */ @@
> > -17041,8 +17055,8 @@ aarch64_vector_costs::add_stmt_cost (int count,
> vect_cost_for_stmt kind,
> >      {
> >        /* Account for any extra "embedded" costs that apply additively
> >  	 to the base cost calculated above.  */
> > -      stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
> > -					    stmt_cost);
> > +      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
> > +					    vectype, m_vec_flags, stmt_cost);
> >
> >        /* If we're recording a nonzero vector loop body cost for the
> >  	 innermost loop, also estimate the operations that would need

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH]AArch64 update costing for MLA by invariant
  2023-08-02 10:52   ` Tamar Christina
@ 2023-08-02 10:56     ` Richard Sandiford
  2023-08-03 12:56       ` Tamar Christina
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Sandiford @ 2023-08-02 10:56 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

Tamar Christina <Tamar.Christina@arm.com> writes:
>> Tamar Christina <tamar.christina@arm.com> writes:
>> > Hi All,
>> >
>> > When determining issue rates we currently discount non-constant MLA
>> > accumulators for Advanced SIMD but don't do it for the latency.
>> >
>> > This means the costs for Advanced SIMD with a constant accumulator are
>> > wrong and results in us costing SVE and Advanced SIMD the same.  This
>> > can cause us to vectorize with Advanced SIMD instead of SVE in some cases.
>> >
>> > This patch adds the same discount for SVE and Scalar as we do for issue rate.
>> >
>> > My assumption was that on issue rate we reject all scalar constants
>> > early because we take into account the extra instruction to create the
>> constant?
>> > Though I'd have expected this to be in prologue costs.  For this
>> > reason I added an extra parameter to allow me to force the check to at
>> > least look for the multiplication.
>> 
>> I'm not sure that was it.  I wish I'd added a comment to say what it was
>> though :(  I suspect different parts of this function were written at different
>> times, hence the inconsistency.
>> 
>> > This gives a 5% improvement in fotonik3d_r in SPECCPU 2017 on large
>> > Neoverse cores.
>> >
>> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> >
>> > Ok for master?
>> >
>> > Thanks,
>> > Tamar
>> >
>> > gcc/ChangeLog:
>> >
>> > 	* config/aarch64/aarch64.cc (aarch64_multiply_add_p): Add param
>> > 	allow_constants.
>> > 	(aarch64_adjust_stmt_cost): Use it.
>> > 	(aarch64_vector_costs::count_ops): Likewise.
>> > 	(aarch64_vector_costs::add_stmt_cost): Pass vinfo to
>> > 	aarch64_adjust_stmt_cost.
>> >
>> > --- inline copy of patch --
>> > diff --git a/gcc/config/aarch64/aarch64.cc
>> > b/gcc/config/aarch64/aarch64.cc index
>> >
>> 560e5431636ef46c41d56faa0c4e95be78f64b50..76b74b77b3f122a3c9725
>> 57e2f83
>> > b63ba365fea9 100644
>> > --- a/gcc/config/aarch64/aarch64.cc
>> > +++ b/gcc/config/aarch64/aarch64.cc
>> > @@ -16398,10 +16398,11 @@ aarch64_advsimd_ldp_stp_p (enum
>> vect_cost_for_stmt kind,
>> >     or multiply-subtract sequence that might be suitable for fusing into a
>> >     single instruction.  If VEC_FLAGS is zero, analyze the operation as
>> >     a scalar one, otherwise analyze it as an operation on vectors with those
>> > -   VEC_* flags.  */
>> > +   VEC_* flags.  When ALLOW_CONSTANTS we'll recognize all accumulators
>> including
>> > +   constant ones.  */
>> >  static bool
>> >  aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
>> > -			unsigned int vec_flags)
>> > +			unsigned int vec_flags, bool allow_constants)
>> >  {
>> >    gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
>> >    if (!assign)
>> > @@ -16410,8 +16411,9 @@ aarch64_multiply_add_p (vec_info *vinfo,
>> stmt_vec_info stmt_info,
>> >    if (code != PLUS_EXPR && code != MINUS_EXPR)
>> >      return false;
>> >
>> > -  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
>> > -      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
>> > +  if (!allow_constants
>> > +      && (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
>> > +	  || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign))))
>> >      return false;
>> >
>> >    for (int i = 1; i < 3; ++i)
>> > @@ -16429,7 +16431,7 @@ aarch64_multiply_add_p (vec_info *vinfo,
>> stmt_vec_info stmt_info,
>> >        if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
>> >  	continue;
>> >
>> > -      if (vec_flags & VEC_ADVSIMD)
>> > +      if (!allow_constants && (vec_flags & VEC_ADVSIMD))
>> >  	{
>> >  	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
>> >  	     although that requires a MOVPRFX for SVE).  However, Advanced
>> > SIMD @@ -16441,7 +16443,8 @@ aarch64_multiply_add_p (vec_info
>> *vinfo, stmt_vec_info stmt_info,
>> >  	    return false;
>> >  	  def_stmt_info = vinfo->lookup_def (rhs);
>> >  	  if (!def_stmt_info
>> > -	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
>> > +	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
>> > +	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
>> 
>> Do you see vect_constant_defs in practice, or is this just for completeness?
>> I would expect any constants to appear as direct operands.  I don't mind
>> keeping it if it's just a belt-and-braces thing though.
>
> In the latency case where I had allow_constants the early rejection based on
> the operand itself wouldn't be rejected so in that case I still needed to reject
> them but do so after the multiply check.  While they do appear as direct
> operands as well they also have their own nodes, in particular for SLP so the
> constants are handled as a group.

Ah, OK, thanks.

> But can also check CONSTANT_CLASS_P (rhs) if that's preferable.

No, what you did is more correct.  I just wasn't sure at first which case
it was handling.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH]AArch64 update costing for MLA by invariant
  2023-08-02 10:56     ` Richard Sandiford
@ 2023-08-03 12:56       ` Tamar Christina
  2023-08-03 13:38         ` Richard Sandiford
  0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2023-08-03 12:56 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

[-- Attachment #1: Type: text/plain, Size: 4385 bytes --]

> >> Do you see vect_constant_defs in practice, or is this just for completeness?
> >> I would expect any constants to appear as direct operands.  I don't
> >> mind keeping it if it's just a belt-and-braces thing though.
> >
> > In the latency case where I had allow_constants the early rejection
> > based on the operand itself wouldn't be rejected so in that case I
> > still needed to reject them but do so after the multiply check.  While
> > they do appear as direct operands as well they also have their own
> > nodes, in particular for SLP so the constants are handled as a group.
> 
> Ah, OK, thanks.
> 
> > But can also check CONSTANT_CLASS_P (rhs) if that's preferable.
> 
> No, what you did is more correct.  I just wasn't sure at first which case it was
> handling.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_multiply_add_p): Update handling
	of constants. 
	(aarch64_adjust_stmt_cost): Use it.
	(aarch64_vector_costs::count_ops): Likewise.
	(aarch64_vector_costs::add_stmt_cost): Pass vinfo to
	aarch64_adjust_stmt_cost.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d4d7602554592b9042b8eaf389eff1ec80c2090e..7cc5916ce06b2635346c807da9306738b939ebc6 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16410,10 +16410,6 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
   if (code != PLUS_EXPR && code != MINUS_EXPR)
     return false;
 
-  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
-      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
-    return false;
-
   for (int i = 1; i < 3; ++i)
     {
       tree rhs = gimple_op (assign, i);
@@ -16441,7 +16437,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
 	    return false;
 	  def_stmt_info = vinfo->lookup_def (rhs);
 	  if (!def_stmt_info
-	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
+	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
+	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
 	    return false;
 	}
 
@@ -16721,8 +16718,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
    and which when vectorized would operate on vector type VECTYPE.  Add the
    cost of any embedded operations.  */
 static fractional_cost
-aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
-			  tree vectype, fractional_cost stmt_cost)
+aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
+			  stmt_vec_info stmt_info, tree vectype,
+			  unsigned vec_flags, fractional_cost stmt_cost)
 {
   if (vectype)
     {
@@ -16745,6 +16743,14 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
 	  break;
 	}
 
+      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
+      if (assign && !vect_is_reduction (stmt_info))
+	{
+	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
+	  if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
+	    return 0;
+	}
+
       if (kind == vector_stmt || kind == vec_to_scalar)
 	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
 	  {
@@ -16814,7 +16820,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
     }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
+  if (stmt_info
+      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
     return;
 
   /* Count the basic operation cost associated with KIND.  */
@@ -17060,8 +17067,8 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
     {
       /* Account for any extra "embedded" costs that apply additively
 	 to the base cost calculated above.  */
-      stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
-					    stmt_cost);
+      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
+					    vectype, m_vec_flags, stmt_cost);
 
       /* If we're recording a nonzero vector loop body cost for the
 	 innermost loop, also estimate the operations that would need

[-- Attachment #2: rb17618.patch --]
[-- Type: application/octet-stream, Size: 3103 bytes --]

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d4d7602554592b9042b8eaf389eff1ec80c2090e..7cc5916ce06b2635346c807da9306738b939ebc6 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16410,10 +16410,6 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
   if (code != PLUS_EXPR && code != MINUS_EXPR)
     return false;
 
-  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
-      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
-    return false;
-
   for (int i = 1; i < 3; ++i)
     {
       tree rhs = gimple_op (assign, i);
@@ -16441,7 +16437,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
 	    return false;
 	  def_stmt_info = vinfo->lookup_def (rhs);
 	  if (!def_stmt_info
-	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
+	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
+	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
 	    return false;
 	}
 
@@ -16721,8 +16718,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
    and which when vectorized would operate on vector type VECTYPE.  Add the
    cost of any embedded operations.  */
 static fractional_cost
-aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
-			  tree vectype, fractional_cost stmt_cost)
+aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
+			  stmt_vec_info stmt_info, tree vectype,
+			  unsigned vec_flags, fractional_cost stmt_cost)
 {
   if (vectype)
     {
@@ -16745,6 +16743,14 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
 	  break;
 	}
 
+      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
+      if (assign && !vect_is_reduction (stmt_info))
+	{
+	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
+	  if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
+	    return 0;
+	}
+
       if (kind == vector_stmt || kind == vec_to_scalar)
 	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
 	  {
@@ -16814,7 +16820,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
     }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
+  if (stmt_info
+      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
     return;
 
   /* Count the basic operation cost associated with KIND.  */
@@ -17060,8 +17067,8 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
     {
       /* Account for any extra "embedded" costs that apply additively
 	 to the base cost calculated above.  */
-      stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
-					    stmt_cost);
+      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
+					    vectype, m_vec_flags, stmt_cost);
 
       /* If we're recording a nonzero vector loop body cost for the
 	 innermost loop, also estimate the operations that would need

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH]AArch64 update costing for MLA by invariant
  2023-08-03 12:56       ` Tamar Christina
@ 2023-08-03 13:38         ` Richard Sandiford
  0 siblings, 0 replies; 6+ messages in thread
From: Richard Sandiford @ 2023-08-03 13:38 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> Do you see vect_constant_defs in practice, or is this just for completeness?
>> >> I would expect any constants to appear as direct operands.  I don't
>> >> mind keeping it if it's just a belt-and-braces thing though.
>> >
>> > In the latency case where I had allow_constants the early rejection
>> > based on the operand itself wouldn't be rejected so in that case I
>> > still needed to reject them but do so after the multiply check.  While
>> > they do appear as direct operands as well they also have their own
>> > nodes, in particular for SLP so the constants are handled as a group.
>> 
>> Ah, OK, thanks.
>> 
>> > But can also check CONSTANT_CLASS_P (rhs) if that's preferable.
>> 
>> No, what you did is more correct.  I just wasn't sure at first which case it was
>> handling.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (aarch64_multiply_add_p): Update handling
> 	of constants. 
> 	(aarch64_adjust_stmt_cost): Use it.
> 	(aarch64_vector_costs::count_ops): Likewise.
> 	(aarch64_vector_costs::add_stmt_cost): Pass vinfo to
> 	aarch64_adjust_stmt_cost.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d4d7602554592b9042b8eaf389eff1ec80c2090e..7cc5916ce06b2635346c807da9306738b939ebc6 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -16410,10 +16410,6 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
>    if (code != PLUS_EXPR && code != MINUS_EXPR)
>      return false;
>  
> -  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
> -      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
> -    return false;
> -
>    for (int i = 1; i < 3; ++i)
>      {
>        tree rhs = gimple_op (assign, i);
> @@ -16441,7 +16437,8 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
>  	    return false;
>  	  def_stmt_info = vinfo->lookup_def (rhs);
>  	  if (!def_stmt_info
> -	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
> +	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
> +	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
>  	    return false;
>  	}
>  
> @@ -16721,8 +16718,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
>     and which when vectorized would operate on vector type VECTYPE.  Add the
>     cost of any embedded operations.  */
>  static fractional_cost
> -aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
> -			  tree vectype, fractional_cost stmt_cost)
> +aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
> +			  stmt_vec_info stmt_info, tree vectype,
> +			  unsigned vec_flags, fractional_cost stmt_cost)
>  {
>    if (vectype)
>      {
> @@ -16745,6 +16743,14 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
>  	  break;
>  	}
>  
> +      gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
> +      if (assign && !vect_is_reduction (stmt_info))
> +	{
> +	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
> +	  if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
> +	    return 0;
> +	}
> +
>        if (kind == vector_stmt || kind == vec_to_scalar)
>  	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
>  	  {
> @@ -16814,7 +16820,8 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
>      }
>  
>    /* Assume that multiply-adds will become a single operation.  */
> -  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
> +  if (stmt_info
> +      && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
>      return;
>  
>    /* Count the basic operation cost associated with KIND.  */

There's no need for this change now that there's no extra parameter.

OK with that change, thanks.

Richard

> @@ -17060,8 +17067,8 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>      {
>        /* Account for any extra "embedded" costs that apply additively
>  	 to the base cost calculated above.  */
> -      stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
> -					    stmt_cost);
> +      stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
> +					    vectype, m_vec_flags, stmt_cost);
>  
>        /* If we're recording a nonzero vector loop body cost for the
>  	 innermost loop, also estimate the operations that would need

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-08-03 13:39 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-02 10:21 [PATCH]AArch64 update costing for MLA by invariant Tamar Christina
2023-08-02 10:43 ` Richard Sandiford
2023-08-02 10:52   ` Tamar Christina
2023-08-02 10:56     ` Richard Sandiford
2023-08-03 12:56       ` Tamar Christina
2023-08-03 13:38         ` Richard Sandiford

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).