public inbox for gcc-patches@gcc.gnu.org
* [PATCH]AArch64 update costing for combining vector conditionals
@ 2023-08-02 10:22 Tamar Christina
  2023-08-02 10:53 ` Richard Sandiford
  0 siblings, 1 reply; 2+ messages in thread
From: Tamar Christina @ 2023-08-02 10:22 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford


Hi All,

Boolean comparisons have different costs depending on the mode.  For SVE,
a && b doesn't require an additional instruction when a or b is predicated:
the AND is free, because the predicate of one operation is combined into the
second one.  At the moment though we only fuse compares, so this update
requires one of the operands to be a comparison.

Scalars also don't require this because the non-ifcvt variant is a series of
branches, where following the branch sequence is itself a natural AND.

Advanced SIMD however does require an actual AND to combine the boolean values.
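
For illustration, the shape of code this is about is e.g. (a shorthand
example, not one of the testsuite cases):

  /* The && on the two compare results is the operation whose cost changes:
     - SVE: the second compare can be governed by the first compare's
       predicate, so no separate AND instruction is expected;
     - Advanced SIMD: two compares plus an explicit AND of the two masks;
     - scalar (non-if-converted): a series of branches, which act as the AND.  */
  void
  f (int *out, int *a, int *b, int n)
  {
    for (int i = 0; i < n; i++)
      if (a[i] > 0 && b[i] < 10)
        out[i] = a[i] + b[i];
  }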

As such, this patch discounts scalar and SVE boolean operation latency and
throughput.

With this patch, comparison-heavy code prefers SVE as it should, especially
in cases where SVE VL == Advanced SIMD VL, where previously the SVE prologue
costs would tip the decision towards Advanced SIMD.
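
As a rough illustration of "comparison heavy" (again a made-up example rather
than a testsuite case), in something like:

  void
  g (int *out, int *a, int *b, int *c, int n)
  {
    for (int i = 0; i < n; i++)
      if (a[i] > 0 && b[i] > 0 && c[i] > 0)
        out[i] = 1;
  }

each additional && is one more AND of the masks that Advanced SIMD has to pay
for, while the SVE and scalar versions are now costed as getting it for free,
so the balance moves towards SVE even when the vector lengths are the same.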

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_bool_compound_p): New.
	(aarch64_adjust_stmt_cost, aarch64_vector_costs::count_ops): Use it.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index b1bacc734b4630257b6ebf8ca7d9afeb34008c10..55963bb28be7ede08b05fb9fddb5a65f6818c63e 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16453,6 +16453,49 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
   return false;
 }
 
+/* Return true if STMT_INFO is the second part of a two-statement boolean AND
+   expression sequence that might be suitable for fusing into a
+   single instruction.  If VEC_FLAGS is zero, analyze the operation as
+   a scalar one, otherwise analyze it as an operation on vectors with those
+   VEC_* flags.  */
+
+static bool
+aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
+			 unsigned int vec_flags)
+{
+  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
+  if (!assign
+      || !STMT_VINFO_VECTYPE (stmt_info)
+      || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info))
+      || gimple_assign_rhs_code (assign) != BIT_AND_EXPR)
+    return false;
+
+  for (int i = 1; i < 3; ++i)
+    {
+      tree rhs = gimple_op (assign, i);
+
+      if (TREE_CODE (rhs) != SSA_NAME)
+	continue;
+
+      stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
+      if (!def_stmt_info
+	  || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
+	continue;
+
+      gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
+      if (!rhs_assign
+	  || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
+		!= tcc_comparison)
+	continue;
+
+      if (vec_flags & VEC_ADVSIMD)
+	return false;
+
+      return true;
+    }
+  return false;
+}
+
 /* We are considering implementing STMT_INFO using SVE.  If STMT_INFO is an
    in-loop reduction that SVE supports directly, return its latency in cycles,
    otherwise return zero.  SVE_COSTS specifies the latencies of the relevant
@@ -16750,11 +16793,17 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
 	}
 
       gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
-      if (assign && !vect_is_reduction (stmt_info))
+      if (assign)
 	{
 	  bool simd_p = vec_flags & VEC_ADVSIMD;
 	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
-	  if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
+	  if (!vect_is_reduction (stmt_info)
+	      && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
+	    return 0;
+
+	  /* For vector boolean ANDs with a compare operand we just need
+	     one insn.  */
+	  if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
 	    return 0;
 	}
 
@@ -16831,6 +16880,12 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
       && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false))
     return;
 
+  /* Assume that bool AND with compare operands will become a single
+     operation.  */
+  if (stmt_info
+      && aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
+    return;
+
   /* Count the basic operation cost associated with KIND.  */
   switch (kind)
     {




-- 


* Re: [PATCH]AArch64 update costing for combining vector conditionals
  2023-08-02 10:22 [PATCH]AArch64 update costing for combining vector conditionals Tamar Christina
@ 2023-08-02 10:53 ` Richard Sandiford
  0 siblings, 0 replies; 2+ messages in thread
From: Richard Sandiford @ 2023-08-02 10:53 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> boolean comparisons have different cost depending on the mode. e.g.
> a && b when predicated doesn't require an addition instruction, the AND is free

Nit (for the commit msg): additional

Maybe:

  for SVE, a && b doesn't require an additional instruction when a or b
  is predicated, ...

?

> by combining the predicate of the one operation into the second one.  At the
> moment though we only fuse compares so this update requires one of the
> operands to be a comparison.
>
> Scalars also don't require this because the non-ifct variant is a series of

Typo: ifcvt

> branches where following the branch sequences themselves are natural ANDs.
>
> Advanced SIMD however does require an actual AND to combine the boolean values.
>
> As such this patch discounts Scalar and SVE boolean operation latency and
> throughput.
>
> With this patch comparison heavy code prefers SVE as it should, especially in
> cases with SVE VL == Advanced SIMD VL where previously the SVE prologue costs
> would tip it towards Advanced SIMD.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (aarch64_bool_compound_p): New.
> 	(aarch64_adjust_stmt_cost, aarch64_vector_costs::count_ops): Use it.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index b1bacc734b4630257b6ebf8ca7d9afeb34008c10..55963bb28be7ede08b05fb9fddb5a65f6818c63e 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -16453,6 +16453,49 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
>    return false;
>  }
>  
> +/* Return true if STMT_INFO is the second part of a two-statement boolean AND
> +   expression sequence that might be suitable for fusing into a
> +   single instruction.  If VEC_FLAGS is zero, analyze the operation as
> +   a scalar one, otherwise analyze it as an operation on vectors with those
> +   VEC_* flags.  */
> +
> +static bool
> +aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
> +			 unsigned int vec_flags)
> +{
> +  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
> +  if (!assign
> +      || !STMT_VINFO_VECTYPE (stmt_info)
> +      || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info))
> +      || gimple_assign_rhs_code (assign) != BIT_AND_EXPR)

Very minor, sorry, but I think the condition reads more naturally
if the BIT_AND_EXPR test comes immediately after the !assign.
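
I.e. something like this (just a sketch of the suggested order):

  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
  if (!assign
      || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
      || !STMT_VINFO_VECTYPE (stmt_info)
      || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
    return false;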

OK with that change, thanks.

Richard

> +    return false;
> +
> +  for (int i = 1; i < 3; ++i)
> +    {
> +      tree rhs = gimple_op (assign, i);
> +
> +      if (TREE_CODE (rhs) != SSA_NAME)
> +	continue;
> +
> +      stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
> +      if (!def_stmt_info
> +	  || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
> +	continue;
> +
> +      gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
> +      if (!rhs_assign
> +	  || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
> +		!= tcc_comparison)
> +	continue;
> +
> +      if (vec_flags & VEC_ADVSIMD)
> +	return false;
> +
> +      return true;
> +    }
> +  return false;
> +}
> +
>  /* We are considering implementing STMT_INFO using SVE.  If STMT_INFO is an
>     in-loop reduction that SVE supports directly, return its latency in cycles,
>     otherwise return zero.  SVE_COSTS specifies the latencies of the relevant
> @@ -16750,11 +16793,17 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
>  	}
>  
>        gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
> -      if (assign && !vect_is_reduction (stmt_info))
> +      if (assign)
>  	{
>  	  bool simd_p = vec_flags & VEC_ADVSIMD;
>  	  /* For MLA we need to reduce the cost since MLA is 1 instruction.  */
> -	  if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
> +	  if (!vect_is_reduction (stmt_info)
> +	      && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags, !simd_p))
> +	    return 0;
> +
> +	  /* For vector boolean ANDs with a compare operand we just need
> +	     one insn.  */
> +	  if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
>  	    return 0;
>  	}
>  
> @@ -16831,6 +16880,12 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
>        && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags, false))
>      return;
>  
> +  /* Assume that bool AND with compare operands will become a single
> +     operation.  */
> +  if (stmt_info
> +      && aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
> +    return;
> +
>    /* Count the basic operation cost associated with KIND.  */
>    switch (kind)
>      {

