public inbox for gcc-patches@gcc.gnu.org
* [RFC][AARCH64] Machine reorg pass for aarch64/Falkor to handle prefetcher tag collision
@ 2018-02-12 23:59 Kugan Vivekanandarajah
  2018-02-13  9:47 ` Kyrill Tkachov
  0 siblings, 1 reply; 4+ messages in thread
From: Kugan Vivekanandarajah @ 2018-02-12 23:59 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 617 bytes --]

This implements a machine reorg pass for aarch64/Falkor to work around
hardware prefetcher tag collisions.  Strictly speaking, this is not part
of the loop unroller, but on Falkor, unrolling can make the hardware
prefetcher perform badly if it introduces too many tag collisions; see
the discussion in https://gcc.gnu.org/ml/gcc/2017-10/msg00178.html.
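
To illustrate, here is a minimal standalone sketch of the tag
computation the pass uses.  It mirrors the make_tag function in the
attached patch; that this matches the hash actually used by the Falkor
prefetcher is an assumption based on the linked discussion:

  #include <stdio.h>

  /* Prefetcher tag built from the destination register number, the
     base register number and the scaled offset (mirrors make_tag in
     the patch).  */
  static unsigned
  make_tag (unsigned dest, unsigned base, unsigned offset)
  {
    return (dest & 0xf) | ((base & 0xf) << 4) | ((offset & 0x3f) << 8);
  }

  int
  main (void)
  {
    /* Only the low four bits of the base register number take part in
       the tag, so loads through x1 and x17 with the same destination
       and offset collide.  */
    printf ("%#x %#x\n", make_tag (0, 1, 2), make_tag (0, 17, 2));
    /* Renaming the base of one load to a free register with different
       low bits (what the pass does) separates the tags.  */
    printf ("%#x %#x\n", make_tag (0, 1, 2), make_tag (0, 9, 2));
    return 0;
  }

The first pair prints identical tags (a collision); the second differs
in the base field, which is what the pass relies on when it renames the
base register of a colliding load.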

gcc/ChangeLog:

2018-02-12  Kugan Vivekanandarajah  <kuganv@linaro.org>

    * config/aarch64/aarch64.c (iv_p): New.
    (strided_load_p): Likewise.
    (make_tag): Likewise.
    (get_load_info): Likewise.
    (aarch64_reorg): Likewise.
    (TARGET_MACHINE_DEPENDENT_REORG): Implement new target hook.

[-- Attachment #2: 0004-reorg-for-tag-collision.patch --]
[-- Type: text/x-patch, Size: 9521 bytes --]

From 0cd4f5acb2117c739ba81bb4b8b71af499107812 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
Date: Mon, 12 Feb 2018 10:44:53 +1100
Subject: [PATCH 4/4] reorg-for-tag-collision

Change-Id: Ic6e42d54268c9112ec1c25de577ca92c1808eeff
---
 gcc/config/aarch64/aarch64.c | 353 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 353 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1ce2a0c..48e7c54 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -71,6 +71,7 @@
 #include "selftest.h"
 #include "selftest-rtl.h"
 #include "rtx-vector-builder.h"
+#include "cfgrtl.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -17203,6 +17204,355 @@ aarch64_select_early_remat_modes (sbitmap modes)
     }
 }
 
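+/* Return true if REG is an induction variable of LOOP: REG must have a
+   definition inside the loop (a pre/post-modify or a binary operation)
+   as well as a definition that dominates the loop header.  */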
+static bool
+iv_p (rtx reg, struct loop *loop)
+{
+  df_ref adef;
+  unsigned regno = REGNO (reg);
+  bool def_in_loop = false;
+  bool def_out_loop = false;
+
+  if (GET_MODE_CLASS (GET_MODE (reg)) != MODE_INT)
+    return false;
+
+  for (adef = DF_REG_DEF_CHAIN (regno); adef; adef = DF_REF_NEXT_REG (adef))
+    {
+      if (!DF_REF_INSN_INFO (adef)
+	  || !NONDEBUG_INSN_P (DF_REF_INSN (adef)))
+	continue;
+
+      basic_block bb = DF_REF_BB (adef);
+      if (dominated_by_p (CDI_DOMINATORS, bb, loop->header)
+	  && bb->loop_father == loop)
+	{
+	  rtx_insn *insn = DF_REF_INSN (adef);
+	  recog_memoized (insn);
+	  rtx pat = PATTERN (insn);
+	  if (GET_CODE (pat) != SET)
+	    continue;
+	  rtx x = SET_SRC (pat);
+	  if (GET_CODE (x) == ZERO_EXTRACT
+	      || GET_CODE (x) == ZERO_EXTEND
+	      || GET_CODE (x) == SIGN_EXTEND)
+	    x = XEXP (x, 0);
+	  if (MEM_P (x))
+	    continue;
+	  if (GET_CODE (x) == POST_INC
+	      || GET_CODE (x) == POST_DEC
+	      || GET_CODE (x) == PRE_INC
+	      || GET_CODE (x) == PRE_DEC)
+	    def_in_loop = true;
+	  else if (BINARY_P (x))
+	    def_in_loop = true;
+	}
+      if (dominated_by_p (CDI_DOMINATORS, loop->header, bb))
+	def_out_loop = true;
+      if (def_in_loop && def_out_loop)
+	return true;
+    }
+  return false;
+}
+
+/* Return true if X is a strided load in LOOP.  If so, set *PRE_POST
+   if it uses pre/post-modify addressing, and record its *BASE and
+   *OFFSET.  */
+
+static bool
+strided_load_p (rtx x,
+		struct loop *loop,
+		bool *pre_post,
+		rtx *base,
+		rtx *offset)
+{
+  /* If the loaded value is extended, look through the extension.  */
+  if (GET_CODE (x) == ZERO_EXTRACT
+      || GET_CODE (x) == ZERO_EXTEND
+      || GET_CODE (x) == SIGN_EXTEND)
+    x = XEXP (x, 0);
+
+  /* If it is not a MEM, it is not a load from memory.  */
+  if (!MEM_P (x))
+    return false;
+
+  /* Get the address of the MEM.  */
+  x = XEXP (x, 0);
+
+  /* If it is a post/pre increment, get the src.  */
+  if (GET_CODE (x) == POST_INC
+      || GET_CODE (x) == POST_DEC
+      || GET_CODE (x) == PRE_INC
+      || GET_CODE (x) == PRE_DEC)
+    {
+      x = XEXP (x, 0);
+      *pre_post = true;
+    }
+
+  /* Get the base and offset depending on the address form.  */
+  if (REG_P (x)
+      || UNARY_P (x))
+    {
+      if (!REG_P (x))
+	x = XEXP (x, 0);
+      if (REG_P (x)
+	  && iv_p (x, loop))
+	{
+	  *base = x;
+	  return true;
+	}
+    }
+  else if (BINARY_P (x))
+    {
+      rtx reg1, reg2;
+      reg1 = XEXP (x, 0);
+
+      if (REG_P (reg1)
+	  && REGNO (reg1) == SP_REGNUM)
+	return false;
+      reg2 = XEXP (x, 1);
+
+      if (REG_P (reg1)
+	  && iv_p (reg1, loop))
+	{
+	  *base = reg1;
+	  *offset = reg2;
+	  return true;
+	}
+
+      if (REG_P (reg1)
+	  && REG_P (reg2)
+	  && iv_p (reg2, loop))
+	{
+	  *base = reg1;
+	  *offset = reg2;
+	  return true;
+	}
+    }
+  return false;
+}
+
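+/* Pack the destination register number DEST, the base register number
+   BASE and the scaled offset OFFSET into a prefetcher tag: bits 0-3
+   hold the destination, bits 4-7 the base and bits 8-13 the offset.  */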
+static unsigned
+make_tag (unsigned dest, unsigned base, unsigned offset)
+{
+  return (dest & 0xf)
+    | ((base & 0xf) << 4)
+    | ((offset & 0x3f) << 8);
+}
+
+/* Return true if INSN contains a strided load in LOOP.  If so, set
+   *PRE_POST, *BASE, *DEST and *OFFSET accordingly.  */
+
+static bool
+get_load_info (rtx_insn *insn,
+	       struct loop *loop,
+	       bool *pre_post,
+	       rtx *base,
+	       rtx *dest,
+	       rtx *offset)
+{
+  subrtx_var_iterator::array_type array;
+  if (!INSN_P (insn) || recog_memoized (insn) < 0)
+    return false;
+  rtx pat = PATTERN (insn);
+  switch (GET_CODE (pat))
+    {
+    case PARALLEL:
+	{
+	  for (int j = 0; j < XVECLEN (pat, 0); ++j)
+	    {
+	      rtx ex = XVECEXP (pat, 0, j);
+	      FOR_EACH_SUBRTX_VAR (iter, array, ex, NONCONST)
+		{
+		  const_rtx x = *iter;
+		  if (GET_CODE (x) == SET
+		      && strided_load_p (SET_SRC (x), loop, pre_post,
+					 base, offset))
+		    {
+		      *dest = SET_DEST (x);
+		      return true;
+		    }
+		}
+	    }
+	}
+      break;
+
+    case SET:
+      FOR_EACH_SUBRTX_VAR (iter, array, SET_SRC (pat), NONCONST)
+	{
+	  rtx x = *iter;
+	  if (strided_load_p (x, loop, pre_post,
+			      base, offset))
+	    {
+	      *dest = SET_DEST (pat);
+	      return true;
+	    }
+	}
+      break;
+
+    default:
+      break;
+    }
+  return false;
+}
+
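+/* Implement TARGET_MACHINE_DEPENDENT_REORG.  On Falkor, find strided
+   loads in each loop whose prefetcher tags (see make_tag) collide and,
+   when a scratch register is available, rename the base register of a
+   colliding load so that the loads get distinct tags.  */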
+static void
+aarch64_reorg (void)
+{
+  basic_block *body, bb;
+  struct loop *loop;
+  rtx_insn *insn;
+
+  if (aarch64_tune != falkor)
+    return;
+
+  compute_bb_for_insn ();
+  /* Compute live regs.  */
+  df_compute_regs_ever_live (true);
+  df_analyze ();
+
+  /* Find the loops.  */
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  calculate_dominance_info (CDI_DOMINATORS);
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+    {
+      hash_map <rtx, auto_vec<rtx_insn *> > tag_map (512);
+      body = get_loop_body (loop);
+      auto_vec <rtx> tags;
+
+      /* Record all the memory tags.  */
+      for (unsigned i = 0; i < loop->num_nodes; i++)
+	{
+	  bb = body[i];
+	  FOR_BB_INSNS (bb, insn)
+	    {
+	      unsigned tag;
+	      rtx base = NULL_RTX;
+	      rtx dest = NULL_RTX;
+	      rtx offset = NULL_RTX;
+	      bool pre_or_post = false;
+
+	      if (!INSN_P (insn)
+		  || DEBUG_INSN_P (insn))
+		continue;
+
+	      if (get_load_info (insn, loop, &pre_or_post,
+				 &base, &dest, &offset))
+		{
+		  int int_offset = 0;
+		  if (offset && REG_P (offset))
+		    int_offset = (1 << 5) | REGNO (offset);
+		  else if (offset && CONST_INT_P (offset))
+		    {
+		      int_offset = INTVAL (offset);
+		      int_offset /= GET_MODE_SIZE (GET_MODE (dest)).to_constant ();
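+		      /* For plain base+offset addressing, drop the low
+			 two bits of the element offset so that nearby
+			 offsets share a tag.  */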
+		      if (!pre_or_post)
+			int_offset >>= 2;
+		    }
+		  tag = make_tag (REGNO (dest), REGNO (base), int_offset);
+		  rtx t = GEN_INT (tag);
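+		  /* Remember each distinct tag the first time it is
+		     seen, and queue the insn under its tag; a tag with
+		     more than one insn queued is a collision.  */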
+		  if (!tag_map.get (t))
+		    tags.safe_push (t);
+		  tag_map.get_or_insert (t).safe_push (insn);
+		}
+	    }
+	}
+
+      for (unsigned i = 0; i < tags.length (); ++i)
+	{
+	  rtx t = tags[i];
+	  auto_vec<rtx_insn *> *v = tag_map.get (t);
+
+	  for (int j = v->length () - 1; j > 0; --j)
+	    {
+	      /* Get the insns whose tags collide.  */
+	      rtx_insn *insn = (*v)[j];
+	      rtx pat;
+	      bool changed = false;
+	      int int_offset = 0;
+	      rtx base = NULL_RTX;
+	      rtx dest = NULL_RTX;
+	      rtx offset = NULL_RTX;
+	      bool pre_or_post = false;
+
+	      if (!get_load_info (insn, loop, &pre_or_post,
+				  &base, &dest, &offset))
+		gcc_unreachable ();
+
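+	      /* Recompute the scaled offset exactly as when the tag was
+		 first recorded.  */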
+	      if (offset && REG_P (offset))
+		int_offset = (1 << 5) | REGNO (offset);
+	      else if (offset && CONST_INT_P (offset))
+		{
+		  int_offset = INTVAL (offset);
+		  int_offset /= GET_MODE_SIZE (GET_MODE (dest)).to_constant ();
+		  if (!pre_or_post)
+		    int_offset >>= 2;
+		}
+
+	      /* Go over temporary registers and find a free register, if
+		 available.  */
+	      for (int k = R9_REGNUM; !changed && (k <= R15_REGNUM); k++)
+		if (!df_hard_reg_used_p (k))
+		  {
+		    unsigned tag;
+		    rtx t;
+
+		    tag = make_tag (REGNO (dest), k, int_offset);
+		    t = GEN_INT (tag);
+		    /* Check to see if the new tag also collides with an
+		       existing load.  */
+		    if (tag_map.get (t))
+		      continue;
+
+		    machine_mode mode = GET_MODE (base);
+		    rtx new_reg = gen_rtx_REG (mode, k);
+		    t = GEN_INT (make_tag (REGNO (dest), REGNO (new_reg),
+					   int_offset));
+		    vec <rtx_insn *> *v2 = tag_map.get (t);
+		    if (v2 && (v2->length () > 0))
+		      continue;
+
+		    /* Change the insn: dest = load (base, offset)
+		       into tmp = base; dest = load (tmp, offset).  */
+		    extract_insn (insn);
+		    for (int l = 0;
+			 (!changed) && (l < recog_data.n_operands); l++)
+		      {
+			subrtx_ptr_iterator::array_type array;
+			rtx *op = recog_data.operand_loc[l];
+
+			if (recog_data.operand_type[l] == OP_OUT)
+			  continue;
+
+			FOR_EACH_SUBRTX_PTR (iter, array, op, NONCONST)
+			  {
+			    rtx *loc = *iter;
+			    rtx x = *loc;
+
+			    if (!changed && (base == x))
+			      {
+				pat = gen_rtx_SET (new_reg, base);
+				if (validate_change (insn, loc, new_reg, false))
+				  {
+				    emit_insn_before (pat, insn);
+				    if (pre_or_post)
+				      {
+					rtx pat2 = gen_rtx_SET (base, new_reg);
+					emit_insn_after (pat2, insn);
+				      }
+				  }
+				v->pop ();
+				tag_map.get_or_insert (t).safe_push (insn);
+				changed = true;
+				break;
+			      }
+			  }
+		      }
+		  }
+	    }
+	}
+    }
+
+  loop_optimizer_finalize ();
+  df_finish_pass (true);
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -17675,6 +18025,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_HW_MAX_MEM_READ_STREAMS
 #define TARGET_HW_MAX_MEM_READ_STREAMS aarch64_hw_max_mem_read_streams
 
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG aarch64_reorg
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
-- 
2.7.4

