public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] aarch64: Some tweaks to the early-ra pass
@ 2023-12-08 16:28 Richard Sandiford
  0 siblings, 0 replies; only message in thread
From: Richard Sandiford @ 2023-12-08 16:28 UTC (permalink / raw)
  To: gcc-patches

early-ra's likely_operand_match_p didn't handle relaxed and special
memory constraints, which meant that the pass wasn't able to match
LD1RQ instructions to their constraints, and so backed out of
trying to allocate.  This patch fixes that by switching the sense
of the match: does the rtx seem appropriate for the constraint?,
rather than: does the constraint seem appropriate for the rtx?

Also, I came across a case that needed more general equivalence
detection.  Previously we would only record equivalences after
the last definition of the source register, but it's worth trying
to handle cases where the destination register's live range is
restricted to a block, and the next definition of the source
occurs only after the end of the destination register's live range.

The patch also fixes a cut-&-pasto that Alex noticed (thanks).

Tested on aarch64-linux-gnu & pushed.

Richard


gcc/
	* config/aarch64/aarch64-early-ra.cc (allocno_info::chain_next):
	Put into a union with...
	(allocno_info::last_def_point):	...new member variable.
	(early_ra::m_current_bb_point): New member variable.
	(likely_operand_match_p): Switch based on get_constraint_type,
	rather than based on rtx code.  Handle relaxed and special memory
	constraints.
	(early_ra::record_copy): Allow the source of an equivalence to be
	assigned to more than once.
	(early_ra::record_allocno_use): Invalidate any previous equivalence.
	Initialize last_def_point.
	(early_ra::record_allocno_def): Set last_def_point.
	(early_ra::valid_equivalence_p): New function, split out from...
	(early_ra::record_copy): ...here.  Use last_def_point to handle
	source registers that have a later definition.
	(make_pass_aarch64_early_ra): Fix comment.

gcc/testsuite/
	* gcc.target/aarch64/sme/strided_2.c: New test.
---
 gcc/config/aarch64/aarch64-early-ra.cc        |  89 +++++++++++---
 .../gcc.target/aarch64/sme/strided_2.c        | 115 ++++++++++++++++++
 2 files changed, 184 insertions(+), 20 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/strided_2.c

diff --git a/gcc/config/aarch64/aarch64-early-ra.cc b/gcc/config/aarch64/aarch64-early-ra.cc
index c065416c5b9..f05869b5cf2 100644
--- a/gcc/config/aarch64/aarch64-early-ra.cc
+++ b/gcc/config/aarch64/aarch64-early-ra.cc
@@ -306,9 +306,18 @@ private:
     // equivalent to EQUIV_ALLOCNO for the whole of this allocno's lifetime.
     unsigned int equiv_allocno;
 
-    // The next chained allocno in program order (i.e. at lower program
-    // points), or INVALID_ALLOCNO if none.
-    unsigned int chain_next;
+    union
+    {
+      // The program point at which the allocno was last defined,
+      // or START_OF_REGION if none.  This is only used temporarily
+      // while recording allocnos; after that, chain_next below is
+      // used instead.
+      unsigned int last_def_point;
+
+      // The next chained allocno in program order (i.e. at lower program
+      // points), or INVALID_ALLOCNO if none.
+      unsigned int chain_next;
+    };
 
     // The previous chained allocno in program order (i.e. at higher
     // program points), or INVALID_ALLOCNO if none.
@@ -406,6 +415,7 @@ private:
   void record_fpr_def (unsigned int);
   void record_allocno_use (allocno_info *);
   void record_allocno_def (allocno_info *);
+  bool valid_equivalence_p (allocno_info *, allocno_info *);
   void record_copy (rtx, rtx, bool = false);
   void record_constraints (rtx_insn *);
   void record_artificial_refs (unsigned int);
@@ -479,6 +489,9 @@ private:
   // The basic block that we're currently processing.
   basic_block m_current_bb;
 
+  // The lowest-numbered program point in the current basic block.
+  unsigned int m_current_bb_point;
+
   // The program point that we're currently processing (described above).
   unsigned int m_current_point;
 
@@ -576,21 +589,26 @@ likely_operand_match_p (const operand_alternative &op_alt, rtx op)
 	return true;
 
       auto cn = lookup_constraint (constraint);
-      if (REG_P (op) || SUBREG_P (op))
+      switch (get_constraint_type (cn))
 	{
-	  if (insn_extra_register_constraint (cn))
+	case CT_REGISTER:
+	  if (REG_P (op) || SUBREG_P (op))
 	    return true;
-	}
-      else if (MEM_P (op))
-	{
-	  if (insn_extra_memory_constraint (cn))
+	  break;
+
+	case CT_MEMORY:
+	case CT_SPECIAL_MEMORY:
+	case CT_RELAXED_MEMORY:
+	  if (MEM_P (op))
 	    return true;
-	}
-      else
-	{
-	  if (!insn_extra_memory_constraint (cn)
-	      && constraint_satisfied_p (op, cn))
+	  break;
+
+	case CT_CONST_INT:
+	case CT_ADDRESS:
+	case CT_FIXED_FORM:
+	  if (constraint_satisfied_p (op, cn))
 	    return true;
+	  break;
 	}
 
       constraint += len;
@@ -1407,10 +1425,14 @@ early_ra::record_allocno_use (allocno_info *allocno)
 {
   bitmap_set_bit (m_live_allocnos, allocno->id);
   if (allocno->end_point > m_current_point)
-    allocno->end_point = m_current_point;
+    {
+      allocno->end_point = m_current_point;
+      allocno->last_def_point = START_OF_REGION;
+    }
   allocno->start_point = m_current_point;
   allocno->is_copy_dest = false;
   allocno->is_strong_copy_dest = false;
+  allocno->equiv_allocno = INVALID_ALLOCNO;
 }
 
 // Record a definition of the allocno with index AI at the current program
@@ -1419,6 +1441,7 @@ early_ra::record_allocno_use (allocno_info *allocno)
 void
 early_ra::record_allocno_def (allocno_info *allocno)
 {
+  allocno->last_def_point = m_current_point;
   allocno->start_point = m_current_point;
   allocno->num_defs = MIN (allocno->num_defs + 1, 2);
   gcc_checking_assert (!allocno->is_copy_dest
@@ -1427,6 +1450,30 @@ early_ra::record_allocno_def (allocno_info *allocno)
     gcc_unreachable ();
 }
 
+// Return true if a move from SRC_ALLOCNO to DEST_ALLOCNO could be treated
+// as an equivalence.
+bool
+early_ra::valid_equivalence_p (allocno_info *dest_allocno,
+			       allocno_info *src_allocno)
+{
+  if (src_allocno->end_point > dest_allocno->end_point)
+    // The src allocno dies first.
+    return false;
+
+  if (src_allocno->num_defs != 0)
+    {
+      if (dest_allocno->end_point < m_current_bb_point)
+	// We don't currently track enough information to handle multiple
+	// definitions across basic block boundaries.
+	return false;
+
+      if (src_allocno->last_def_point >= dest_allocno->end_point)
+	// There is another definition during the destination's live range.
+	return false;
+    }
+  return dest_allocno->num_defs == 1;
+}
+
 // Record any relevant allocno-related information for an actual or imagined
 // copy from SRC to DEST.  FROM_MOVE_P is true if the copy was an explicit
 // move instruction, false if it represents one way of satisfying the previous
@@ -1512,9 +1559,7 @@ early_ra::record_copy (rtx dest, rtx src, bool from_move_p)
 	      dest_allocno->is_copy_dest = 1;
 	    }
 	  else if (from_move_p
-		   && src_allocno->end_point <= dest_allocno->end_point
-		   && src_allocno->num_defs == 0
-		   && dest_allocno->num_defs == 1)
+		   && valid_equivalence_p (dest_allocno, src_allocno))
 	    dest_allocno->equiv_allocno = src_allocno->id;
 	}
     }
@@ -3048,6 +3093,9 @@ early_ra::apply_allocation ()
 void
 early_ra::process_region ()
 {
+  for (auto *allocno : m_allocnos)
+    allocno->chain_next = INVALID_ALLOCNO;
+
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
       dump_fpr_ranges ();
@@ -3117,6 +3165,8 @@ void
 early_ra::process_block (basic_block bb, bool is_isolated)
 {
   m_current_bb = bb;
+  m_current_point += 1;
+  m_current_bb_point = m_current_point;
 
   // Process live-out FPRs.
   bitmap live_out = df_get_live_out (bb);
@@ -3414,8 +3464,7 @@ pass_early_ra::execute (function *fn)
 
 } // end namespace
 
-// Create a new CC fusion pass instance.
-
+// Create a new instance of the pass.
 rtl_opt_pass *
 make_pass_aarch64_early_ra (gcc::context *ctxt)
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/strided_2.c b/gcc/testsuite/gcc.target/aarch64/sme/strided_2.c
new file mode 100644
index 00000000000..2e58ae643ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/strided_2.c
@@ -0,0 +1,115 @@
+// { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_sme.h>
+
+#pragma GCC target "+sme2"
+
+// This file deliberately contains nonsense code.
+
+/*
+** test1:
+**	ptrue	(pn[0-9]+)\.s
+**	ld1w	{z16\.s - z19\.s}, \1/z, \[x1\]
+**	ld1w	{z20\.s - z23\.s}, \1/z, \[x1, #4, mul vl\]
+**	ld1w	{z24\.s - z27\.s}, \1/z, \[x1, #8, mul vl\]
+**	ld1w	{z28\.s - z31\.s}, \1/z, \[x1, #12, mul vl\]
+**	ptrue	[^\n]+
+**	ld1rqw	[^\n]+
+**	ld1rqw	[^\n]+
+**	sclamp	{z16.s - z19.s}, [^\n]+
+**	sclamp	{z20.s - z23.s}, [^\n]+
+**	sclamp	{z24.s - z27.s}, [^\n]+
+**	sclamp	{z28.s - z31.s}, [^\n]+
+**	st1w	{z16\.s, z20\.s, z24\.s, z28\.s}, \1, \[x0\]
+**	st1w	{z17\.s, z21\.s, z25\.s, z29\.s}, \1, \[x0, #4, mul vl\]
+**	st1w	{z18\.s, z22\.s, z26\.s, z30\.s}, \1, \[x0, #8, mul vl\]
+**	st1w	{z19\.s, z23\.s, z27\.s, z31\.s}, \1, \[x0, #12, mul vl\]
+**	st1w	{z16\.s, z20\.s, z24\.s, z28\.s}, \1, \[x0, #16, mul vl\]
+**	st1w	{z17\.s, z21\.s, z25\.s, z29\.s}, \1, \[x0, #20, mul vl\]
+**	st1w	{z18\.s, z22\.s, z26\.s, z30\.s}, \1, \[x0, #24, mul vl\]
+**	st1w	{z19\.s, z23\.s, z27\.s, z31\.s}, \1, \[x0, #28, mul vl\]
+**	ld1w	{z16\.s - z19\.s}, \1/z, \[x3\]
+**	ld1w	{z20\.s - z23\.s}, \1/z, \[x3, #4, mul vl\]
+**	ld1w	{z24\.s - z27\.s}, \1/z, \[x3, #8, mul vl\]
+**	ld1w	{z28\.s - z31\.s}, \1/z, \[x3, #12, mul vl\]
+**	sclamp	{z16.s - z19.s}, [^\n]+
+**	sclamp	{z20.s - z23.s}, [^\n]+
+**	sclamp	{z24.s - z27.s}, [^\n]+
+**	sclamp	{z28.s - z31.s}, [^\n]+
+**	...
+**	ret
+*/
+void test1(int32_t *dest, int32_t *src1, int32_t *src2,
+	   int32_t *src3) __arm_streaming
+{
+  svcount_t pg = svptrue_c32();
+  svint32x4_t l0 = svld1_vnum_x4(pg, src1, 0);
+  svint32x4_t l1 = svld1_vnum_x4(pg, src1, 4);
+  svint32x4_t l2 = svld1_vnum_x4(pg, src1, 8);
+  svint32x4_t l3 = svld1_vnum_x4(pg, src1, 12);
+  svint32_t l4 = svld1rq(svptrue_b32(), src2);
+  svint32_t l5 = svld1rq(svptrue_b32(), src2 + 4);
+  l0 = svclamp(l0, l4, l5);
+  l1 = svclamp(l1, l4, l5);
+  l2 = svclamp(l2, l4, l5);
+  l3 = svclamp(l3, l4, l5);
+  svst1_vnum(pg, dest, 0,
+	     svcreate4(svget4(l0, 0), svget4(l1, 0),
+		       svget4(l2, 0), svget4(l3, 0)));
+  svst1_vnum(pg, dest, 4,
+	     svcreate4(svget4(l0, 1), svget4(l1, 1),
+		       svget4(l2, 1), svget4(l3, 1)));
+  svst1_vnum(pg, dest, 8,
+	     svcreate4(svget4(l0, 2), svget4(l1, 2),
+		       svget4(l2, 2), svget4(l3, 2)));
+  svst1_vnum(pg, dest, 12,
+	     svcreate4(svget4(l0, 3), svget4(l1, 3),
+		       svget4(l2, 3), svget4(l3, 3)));
+  svst1_vnum(pg, dest, 16,
+	     svcreate4(svget4(l0, 0), svget4(l1, 0),
+		       svget4(l2, 0), svget4(l3, 0)));
+  svst1_vnum(pg, dest, 20,
+	     svcreate4(svget4(l0, 1), svget4(l1, 1),
+		       svget4(l2, 1), svget4(l3, 1)));
+  svst1_vnum(pg, dest, 24,
+	     svcreate4(svget4(l0, 2), svget4(l1, 2),
+		       svget4(l2, 2), svget4(l3, 2)));
+  svst1_vnum(pg, dest, 28,
+	     svcreate4(svget4(l0, 3), svget4(l1, 3),
+		       svget4(l2, 3), svget4(l3, 3)));
+  l0 = svld1_vnum_x4(pg, src3, 0);
+  l1 = svld1_vnum_x4(pg, src3, 4);
+  l2 = svld1_vnum_x4(pg, src3, 8);
+  l3 = svld1_vnum_x4(pg, src3, 12);
+  l0 = svclamp(l0, l4, l5);
+  l1 = svclamp(l1, l4, l5);
+  l2 = svclamp(l2, l4, l5);
+  l3 = svclamp(l3, l4, l5);
+  svst1_vnum(pg, dest, 32,
+	     svcreate4(svget4(l0, 0), svget4(l1, 0),
+		       svget4(l2, 0), svget4(l3, 0)));
+  svst1_vnum(pg, dest, 36,
+	     svcreate4(svget4(l0, 1), svget4(l1, 1),
+		       svget4(l2, 1), svget4(l3, 1)));
+  svst1_vnum(pg, dest, 40,
+	     svcreate4(svget4(l0, 2), svget4(l1, 2),
+		       svget4(l2, 2), svget4(l3, 2)));
+  svst1_vnum(pg, dest, 44,
+	     svcreate4(svget4(l0, 3), svget4(l1, 3),
+		       svget4(l2, 3), svget4(l3, 3)));
+  svst1_vnum(pg, dest, 48,
+	     svcreate4(svget4(l0, 0), svget4(l1, 0),
+		       svget4(l2, 0), svget4(l3, 0)));
+  svst1_vnum(pg, dest, 52,
+	     svcreate4(svget4(l0, 1), svget4(l1, 1),
+		       svget4(l2, 1), svget4(l3, 1)));
+  svst1_vnum(pg, dest, 56,
+	     svcreate4(svget4(l0, 2), svget4(l1, 2),
+		       svget4(l2, 2), svget4(l3, 2)));
+  svst1_vnum(pg, dest, 60,
+	     svcreate4(svget4(l0, 3), svget4(l1, 3),
+		       svget4(l2, 3), svget4(l3, 3)));
+}
+
+/* { dg-final { scan-assembler-not {\tmov\tz} } } */
-- 
2.25.1


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-12-08 16:28 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-08 16:28 [PATCH] aarch64: Some tweaks to the early-ra pass Richard Sandiford

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).