public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-9157] aarch64: Spread out FPR usage between RA regions [PR113613]
@ 2024-02-23 14:13 Richard Sandiford
  0 siblings, 0 replies; only message in thread
From: Richard Sandiford @ 2024-02-23 14:13 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:ff442719cdb64c9df9d069af88e90d51bee6fb56

commit r14-9157-gff442719cdb64c9df9d069af88e90d51bee6fb56
Author: Richard Sandiford <richard.sandiford@arm.com>
Date:   Fri Feb 23 14:12:55 2024 +0000

    aarch64: Spread out FPR usage between RA regions [PR113613]
    
    early-ra already had code to do regrename-style "broadening"
    of the allocation, to promote scheduling freedom.  However,
    the pass divides the function into allocation regions
    and this broadening only worked within a single region.
    This meant that if a basic block contained one subblock
    of FPR use, followed by a point at which no FPRs were live,
    followed by another subblock of FPR use, the two subblocks
    would tend to reuse the same registers.  This in turn meant
    that it wasn't possible to form LDP/STP pairs between them.
    
    The failure to form LDPs and STPs in the testcase was a
    regression from GCC 13.
    
    The patch adds a simple heuristic to prefer less recently
    used registers in the event of a tie: equal weight in
    allocate_colors, or equal start point in find_oldest_color.
    
    gcc/
            PR target/113613
            * config/aarch64/aarch64-early-ra.cc
            (early_ra::m_current_region): New member variable.
            (early_ra::m_fpr_recency): Likewise.
            (early_ra::start_new_region): Bump m_current_region.
            (early_ra::allocate_colors): Prefer less recently used registers
            in the event of a tie.  Add a comment to explain why we prefer(ed)
            higher-numbered registers.
            (early_ra::find_oldest_color): Prefer less recently used registers
            here too.
            (early_ra::finalize_allocation): Update recency information for
            allocated registers.
            (early_ra::process_blocks): Initialize m_current_region and
            m_fpr_recency.
    
    gcc/testsuite/
            PR target/113613
            * gcc.target/aarch64/pr113613.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-early-ra.cc      | 55 ++++++++++++++++++++++++-----
 gcc/testsuite/gcc.target/aarch64/pr113613.c | 13 +++++++
 2 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-early-ra.cc b/gcc/config/aarch64/aarch64-early-ra.cc
index 9ac9ec1bb0db..8530b0ae41ea 100644
--- a/gcc/config/aarch64/aarch64-early-ra.cc
+++ b/gcc/config/aarch64/aarch64-early-ra.cc
@@ -532,6 +532,12 @@ private:
   // The set of FPRs that are currently live.
   unsigned int m_live_fprs;
 
+  // A unique one-based identifier for the current region.
+  unsigned int m_current_region;
+
+  // The region in which each FPR was last used, or 0 if none.
+  unsigned int m_fpr_recency[32];
+
   // ----------------------------------------------------------------------
 
   // A mask of the FPRs that have already been allocated.
@@ -1305,6 +1311,7 @@ early_ra::start_new_region ()
   m_allocated_fprs = 0;
   m_call_preserved_fprs = 0;
   m_allocation_successful = true;
+  m_current_region += 1;
 }
 
 // Create and return an allocno group of size SIZE for register REGNO.
@@ -2819,19 +2826,30 @@ early_ra::allocate_colors ()
 	candidates &= ~(m_allocated_fprs >> i);
       unsigned int best = INVALID_REGNUM;
       int best_weight = 0;
+      unsigned int best_recency = 0;
       for (unsigned int fpr = 0; fpr <= 32U - color->group->size; ++fpr)
 	{
 	  if ((candidates & (1U << fpr)) == 0)
 	    continue;
 	  int weight = color->fpr_preferences[fpr];
+	  unsigned int recency = 0;
 	  // Account for registers that the current function must preserve.
 	  for (unsigned int i = 0; i < color->group->size; ++i)
-	    if (m_call_preserved_fprs & (1U << (fpr + i)))
-	      weight -= 1;
-	  if (best == INVALID_REGNUM || best_weight <= weight)
+	    {
+	      if (m_call_preserved_fprs & (1U << (fpr + i)))
+		weight -= 1;
+	      recency = MAX (recency, m_fpr_recency[fpr + i]);
+	    }
+	  // Prefer higher-numbered registers in the event of a tie.
+	  // This should tend to keep lower-numbered registers free
+	  // for allocnos that require V0-V7 or V0-V15.
+	  if (best == INVALID_REGNUM
+	      || best_weight < weight
+	      || (best_weight == weight && recency <= best_recency))
 	    {
 	      best = fpr;
 	      best_weight = weight;
+	      best_recency = recency;
 	    }
 	}
 
@@ -2888,19 +2906,27 @@ early_ra::find_oldest_color (unsigned int first_color,
 {
   color_info *best = nullptr;
   unsigned int best_start_point = ~0U;
+  unsigned int best_recency = 0;
   for (unsigned int ci = first_color; ci < m_colors.length (); ++ci)
     {
       auto *color = m_colors[ci];
-      if (fpr_conflicts & (1U << (color->hard_regno - V0_REGNUM)))
+      unsigned int fpr = color->hard_regno - V0_REGNUM;
+      if (fpr_conflicts & (1U << fpr))
 	continue;
-      if (!color->group)
-	return color;
-      auto chain_head = color->group->chain_heads ()[0];
-      auto start_point = m_allocnos[chain_head]->start_point;
-      if (!best || best_start_point > start_point)
+      unsigned int start_point = 0;
+      if (color->group)
+	{
+	  auto chain_head = color->group->chain_heads ()[0];
+	  start_point = m_allocnos[chain_head]->start_point;
+	}
+      unsigned int recency = m_fpr_recency[fpr];
+      if (!best
+	  || best_start_point > start_point
+	  || (best_start_point == start_point && recency < best_recency))
 	{
 	  best = color;
 	  best_start_point = start_point;
+	  best_recency = recency;
 	}
     }
   return best;
@@ -3004,6 +3030,13 @@ early_ra::broaden_colors ()
 void
 early_ra::finalize_allocation ()
 {
+  for (auto *color : m_colors)
+    if (color->group)
+      {
+	unsigned int fpr = color->hard_regno - V0_REGNUM;
+	for (unsigned int i = 0; i < color->group->size; ++i)
+	  m_fpr_recency[fpr + i] = m_current_region;
+      }
   for (auto *allocno : m_allocnos)
     {
       if (allocno->is_shared ())
@@ -3521,6 +3554,10 @@ early_ra::process_blocks ()
 	bitmap_set_bit (fpr_pseudos_live_in, bb->index);
     }
 
+  // This is incremented by 1 at the start of each region.
+  m_current_region = 0;
+  memset (m_fpr_recency, 0, sizeof (m_fpr_recency));
+
   struct stack_node { edge_iterator ei; basic_block bb; };
 
   auto_vec<stack_node, 32> stack;
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113613.c b/gcc/testsuite/gcc.target/aarch64/pr113613.c
new file mode 100644
index 000000000000..382e4a11c0a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113613.c
@@ -0,0 +1,13 @@
+// { dg-options "-O2" }
+
+typedef float __attribute__((vector_size(8))) v2sf;
+v2sf a[4];
+v2sf b[4];
+void f()
+{
+  b[0] += a[0];
+  b[1] += a[1];
+}
+
+// { dg-final { scan-assembler-times {\tldp\t} 2 } }
+// { dg-final { scan-assembler-times {\tstp\t} 1 } }

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-02-23 14:13 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-23 14:13 [gcc r14-9157] aarch64: Spread out FPR usage between RA regions [PR113613] Richard Sandiford

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).