[gcc r12-7940] aarch64: Implement determine_suggested_unroll_factor

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

* [gcc r12-7940] aarch64: Implement determine_suggested_unroll_factor
@ 2022-03-31 16:15 Andre Simoes Dias Vieira
  0 siblings, 0 replies; only message in thread
From: Andre Simoes Dias Vieira @ 2022-03-31 16:15 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:40d643d8de7bb0b7bd75e35f4274beb9793bb0df

commit r12-7940-g40d643d8de7bb0b7bd75e35f4274beb9793bb0df
Author: Andre Vieira <andre.simoesdiasvieira@arm.com>
Date:   Thu Mar 31 17:08:59 2022 +0100

    aarch64: Implement determine_suggested_unroll_factor
    
    This patch implements the costing function determine_suggested_unroll_factor
    for aarch64.
    It determines the unrolling factor by dividing the number of X operations we
    can do per cycle by the number of X operations, taking this information from
    the vec_ops analysis during vector costing and the available issue_info
    information.
    We multiply the dividend by a potential reduction_latency, to improve our
    pipeline utilization if we are stalled waiting on a particular reduction
    operation.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64.cc (aarch64_vector_costs): Define
            determine_suggested_unroll_factor and m_has_avg.
            (determine_suggested_unroll_factor): New function.
            (aarch64_vector_costs::add_stmt_cost): Check for an averaging
            operation to set m_has_avg.
            (aarch64_vector_costs::finish_cost): Use
            determine_suggested_unroll_factor.
            * config/aarch64/aarch64.opt (aarch64-vect-unroll-limit): New.
            * doc/invoke.texi: (aarch64-vect-unroll-limit): Document new option.

Diff:
---
 gcc/config/aarch64/aarch64.cc  | 89 +++++++++++++++++++++++++++++++++++++++++-
 gcc/config/aarch64/aarch64.opt |  4 ++
 gcc/doc/invoke.texi            |  6 +++
 3 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index ab78b11b158..18f80499079 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15637,11 +15637,16 @@ private:
   unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
 				 unsigned int);
   bool prefer_unrolled_loop () const;
+  unsigned int determine_suggested_unroll_factor ();
 
   /* True if we have performed one-time initialization based on the
      vec_info.  */
   bool m_analyzed_vinfo = false;
 
+  /* This loop uses an average operation that is not supported by SVE, but is
+     supported by Advanced SIMD and SVE2.  */
+  bool m_has_avg = false;
+
   /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
      - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
        SIMD code.
@@ -16642,6 +16647,21 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 	 as one iteration of the SVE loop.  */
       if (where == vect_body && m_unrolled_advsimd_niters)
 	m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
+
+      /* Detect the use of an averaging operation.  */
+      gimple *stmt = stmt_info->stmt;
+      if (is_gimple_call (stmt)
+	  && gimple_call_internal_p (stmt))
+	{
+	  switch (gimple_call_internal_fn (stmt))
+	    {
+	    case IFN_AVG_FLOOR:
+	    case IFN_AVG_CEIL:
+	      m_has_avg = true;
+	    default:
+	      break;
+	    }
+	}
     }
   return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
 }
@@ -16725,6 +16745,68 @@ adjust_body_cost_sve (const aarch64_vec_op_count *ops,
   return sve_cycles_per_iter;
 }
 
+/* Return the unroll factor to suggest for the loop being costed: the largest
+   per-M_OPS-entry issue-rate bound, starting from aarch64-vect-unroll-limit
+   and rounded up to a power of 2, or 1 if no unrolling should be suggested.  */
+unsigned int
+aarch64_vector_costs::determine_suggested_unroll_factor ()
+{
+  bool sve = m_vec_flags & VEC_ANY_SVE;
+  /* If we are trying to unroll an Advanced SIMD main loop that contains
+     an averaging operation that we do not support with SVE and we might use a
+     predicated epilogue, we need to be conservative and block unrolling as
+     this might lead to a less optimal loop for the first and only epilogue
+     using the original loop's vectorization factor.
+     TODO: Remove this constraint when we add support for multiple epilogue
+     vectorization.  */
+  if (!sve && !TARGET_SVE2 && m_has_avg)
+    return 1;
+
+  unsigned int max_unroll_factor = 1;
+  for (auto &vec_ops : m_ops)
+    {
+      const aarch64_simd_vec_issue_info *issue = vec_ops.simd_issue_info ();
+      if (!issue)
+	return 1;
+      /* Start from the user-adjustable limit, which defaults to 4.  */
+      unsigned int unroll_factor = aarch64_vect_unroll_limit;
+      unsigned int factor
+	= vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
+
+      /* Sanity check, this should never happen.  */
+      if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
+	return 1;
+
+      /* Check stores.  */
+      if (vec_ops.stores > 0)
+	{
+	  unsigned int bound = CEIL (factor * issue->stores_per_cycle,
+				     vec_ops.stores);
+	  unroll_factor = MIN (unroll_factor, bound);
+	}
+
+      /* Check loads + stores.  */
+      if (vec_ops.loads > 0)
+	{
+	  unsigned int bound = CEIL (factor * issue->loads_stores_per_cycle,
+				     vec_ops.loads + vec_ops.stores);
+	  unroll_factor = MIN (unroll_factor, bound);
+	}
+
+      /* Check general ops.  */
+      if (vec_ops.general_ops > 0)
+	{
+	  unsigned int bound = CEIL (factor * issue->general_ops_per_cycle,
+				     vec_ops.general_ops);
+	  unroll_factor = MIN (unroll_factor, bound);
+	}
+      max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
+    }
+
+  /* Make sure unroll factor is power of 2.  */
+  return 1 << ceil_log2 (max_unroll_factor);
+}
+
 /* BODY_COST is the cost of a vector loop body.  Adjust the cost as necessary
    and return the new cost.  */
 unsigned int
@@ -16861,8 +16943,11 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
   if (loop_vinfo
       && m_vec_flags
       && aarch64_use_new_vector_costs_p ())
-    m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
-					   m_costs[vect_body]);
+    {
+      m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
+					     m_costs[vect_body]);
+      m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+    }
 
   /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
      the scalar code in the event of a tie, since there is more chance
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 98ce9c0ab61..92220b26ee2 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -292,3 +292,7 @@ Constant memmove size in bytes above which to start using MOPS sequence.
 -param=aarch64-mops-memset-size-threshold=
 Target Joined UInteger Var(aarch64_mops_memset_size_threshold) Init(256) Param
 Constant memset size in bytes from which to start using MOPS sequence.
+
+-param=aarch64-vect-unroll-limit=
+Target Joined UInteger Var(aarch64_vect_unroll_limit) Init(4) Param
+Limit how much the autovectorizer may unroll a loop.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 09715a510b4..3936aef69d0 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -15239,6 +15239,12 @@ If this parameter is set to @var{n}, GCC will not use this heuristic
 for loops that are known to execute in fewer than @var{n} Advanced
 SIMD iterations.
 
+@item aarch64-vect-unroll-limit
+The vectorizer will use available tuning information to determine whether it
+would be beneficial to unroll the main vectorized loop and by how much.  This
+parameter sets the upper bound of how much the vectorizer will unroll the main
+loop.  The default value is four.
+
 @end table
 
 @end table


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-03-31 16:15 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-31 16:15 [gcc r12-7940] aarch64: Implement determine_suggested_unroll_factor Andre Simoes Dias Vieira

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).