public inbox for gcc-patches@gcc.gnu.org
* [PATCH 0/2 V2] aarch64: Place target independent and dependent code in one file.
@ 2024-02-15 18:43 Ajit Agarwal
  2024-02-22 19:49 ` Richard Sandiford
  0 siblings, 1 reply; 4+ messages in thread
From: Ajit Agarwal @ 2024-02-15 18:43 UTC (permalink / raw)
  To: Alex Coplan, Richard Sandiford, Kewen.Lin, Segher Boessenkool,
	Michael Meissner, Peter Bergner, David Edelsohn, gcc-patches

Hello Alex/Richard:

I have placed the target independent and target dependent code in
aarch64-ldp-fusion.cc for load/store fusion.

The common infrastructure for load/store pair fusion is divided into
target independent and target dependent code.

The target independent code is generic code that uses pure virtual
functions as the interface between the target independent and target
dependent parts.

The target dependent code implements these pure virtual functions for
the aarch64 target and calls into the target independent code.
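
To make the intended structure concrete, here is a minimal,
self-contained sketch of the pure virtual interface pattern described
above; the class and member names are invented for illustration and do
not exactly match those in the patch:

#include <cstdio>

struct insn_stub { int uid; };   // stand-in for rtl-ssa's insn_info

// Target independent side: generic pair-discovery logic that only
// talks to the target through pure virtual hooks.
struct pair_fusion_iface
{
  virtual ~pair_fusion_iface () {}

  // Hooks each target must implement.
  virtual bool pair_mem_ok_p (int access_size) = 0;
  virtual void gen_pair (insn_stub *i1, insn_stub *i2, bool load_p) = 0;

  // Generic driver code that calls back into the hooks.
  void try_fuse (insn_stub *i1, insn_stub *i2, int access_size, bool load_p)
  {
    if (pair_mem_ok_p (access_size))
      gen_pair (i1, i2, load_p);
  }
};

// Target dependent side: the aarch64 implementation of the hooks
// (greatly simplified).
struct aarch64_pair_fusion : pair_fusion_iface
{
  bool pair_mem_ok_p (int access_size) override
  {
    return access_size == 4 || access_size == 8 || access_size == 16;
  }
  void gen_pair (insn_stub *i1, insn_stub *i2, bool load_p) override
  {
    std::printf ("emit %s for insns %d, %d\n",
                 load_p ? "ldp" : "stp", i1->uid, i2->uid);
  }
};

int main ()
{
  insn_stub a = { 1 }, b = { 2 };
  aarch64_pair_fusion fusion;
  fusion.try_fuse (&a, &b, 8, /*load_p=*/true);   // prints "emit ldp ..."
  return 0;
}

In the patch itself the generic class carries the per-basic-block state
and the aarch64 code supplies the hook implementations.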

Bootstrapped on aarch64-linux-gnu.

Thanks & Regards
Ajit


aarch64: Place target independent and dependent code in one file.

The common infrastructure for load/store pair fusion is divided into
target independent and target dependent code.

The target independent code is generic code that uses pure virtual
functions as the interface between the target independent and target
dependent parts.

The target dependent code implements these pure virtual functions for
the aarch64 target and calls into the target independent code.

2024-02-15  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-ldp-fusion.cc: Separate target
	independent and target dependent code.
---
 gcc/config/aarch64/aarch64-ldp-fusion.cc | 3513 ++++++++++++----------
 1 file changed, 1842 insertions(+), 1671 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 22ed95eb743..0ab842e2bbb 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -17,6 +17,7 @@
 // along with GCC; see the file COPYING3.  If not see
 // <http://www.gnu.org/licenses/>.
 
+
 #define INCLUDE_ALGORITHM
 #define INCLUDE_FUNCTIONAL
 #define INCLUDE_LIST
@@ -37,13 +38,12 @@
 #include "tree-hash-traits.h"
 #include "print-tree.h"
 #include "insn-attr.h"
-
 using namespace rtl_ssa;
 
-static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7;
-static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1));
-static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1;
-static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1;
+static constexpr HOST_WIDE_INT PAIR_MEM_IMM_BITS = 7;
+static constexpr HOST_WIDE_INT PAIR_MEM_IMM_SIGN_BIT = (1 << (PAIR_MEM_IMM_BITS - 1));
+static constexpr HOST_WIDE_INT PAIR_MEM_MAX_IMM = PAIR_MEM_IMM_SIGN_BIT - 1;
+static constexpr HOST_WIDE_INT PAIR_MEM_MIN_IMM = -PAIR_MEM_MAX_IMM - 1;
 
 // We pack these fields (load_p, fpsimd_p, and size) into an integer
 // (LFS) which we use as part of the key into the main hash tables.
@@ -138,8 +138,144 @@ struct alt_base
   poly_int64 offset;
 };
 
+// Class that implements a state machine for building the changes needed to form
+// a store pair instruction.  This allows us to easily build the changes in
+// program order, as required by rtl-ssa.
+struct stp_change_builder
+{
+  enum class state
+  {
+    FIRST,
+    INSERT,
+    FIXUP_USE,
+    LAST,
+    DONE
+  };
+
+  enum class action
+  {
+    TOMBSTONE,
+    CHANGE,
+    INSERT,
+    FIXUP_USE
+  };
+
+  struct change
+  {
+    action type;
+    insn_info *insn;
+  };
+
+  bool done () const { return m_state == state::DONE; }
+
+  stp_change_builder (insn_info *insns[2],
+		      insn_info *repurpose,
+		      insn_info *dest)
+    : m_state (state::FIRST), m_insns { insns[0], insns[1] },
+      m_repurpose (repurpose), m_dest (dest), m_use (nullptr) {}
+
+  change get_change () const
+  {
+    switch (m_state)
+      {
+      case state::FIRST:
+	return {
+	  m_insns[0] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
+	  m_insns[0]
+	};
+      case state::LAST:
+	return {
+	  m_insns[1] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
+	  m_insns[1]
+	};
+      case state::INSERT:
+	return { action::INSERT, m_dest };
+      case state::FIXUP_USE:
+	return { action::FIXUP_USE, m_use->insn () };
+      case state::DONE:
+	break;
+      }
+
+    gcc_unreachable ();
+  }
+
+  // Transition to the next state.
+  void advance ()
+  {
+    switch (m_state)
+      {
+      case state::FIRST:
+	if (m_repurpose)
+	  m_state = state::LAST;
+	else
+	  m_state = state::INSERT;
+	break;
+      case state::INSERT:
+      {
+	def_info *def = memory_access (m_insns[0]->defs ());
+	while (*def->next_def ()->insn () <= *m_dest)
+	  def = def->next_def ();
+
+	// Now we know DEF feeds the insertion point for the new stp.
+	// Look for any uses of DEF that will consume the new stp.
+	gcc_assert (*def->insn () <= *m_dest
+		    && *def->next_def ()->insn () > *m_dest);
+
+	auto set = as_a<set_info *> (def);
+	for (auto use : set->nondebug_insn_uses ())
+	  if (*use->insn () > *m_dest)
+	    {
+	      m_use = use;
+	      break;
+	    }
+
+	if (m_use)
+	  m_state = state::FIXUP_USE;
+	else
+	  m_state = state::LAST;
+	break;
+      }
+      case state::FIXUP_USE:
+	m_use = m_use->next_nondebug_insn_use ();
+	if (!m_use)
+	  m_state = state::LAST;
+	break;
+      case state::LAST:
+	m_state = state::DONE;
+	break;
+      case state::DONE:
+	gcc_unreachable ();
+      }
+  }
+
+private:
+  state m_state;
+
+  // Original candidate stores.
+  insn_info *m_insns[2];
+
+  // If non-null, this is a candidate insn to change into an stp.  Otherwise we
+  // are deleting both original insns and inserting a new insn for the stp.
+  insn_info *m_repurpose;
+
+  // Destination of the stp; it will be placed immediately after m_dest.
+  insn_info *m_dest;
+
+  // Current nondebug use that needs updating due to stp insertion.
+  use_info *m_use;
+};
+
+// Virtual base class for load/store walkers used in alias analysis.
+struct alias_walker
+{
+  virtual bool conflict_p (int &budget) const = 0;
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual void advance () = 0;
+};
+
 // State used by the pass for a given basic block.
-struct ldp_bb_info
+struct pair_fusion
 {
   using def_hash = nofree_ptr_hash<def_info>;
   using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>;
@@ -161,13 +297,13 @@ struct ldp_bb_info
   static const size_t obstack_alignment = sizeof (void *);
   bb_info *m_bb;
 
-  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
+  pair_fusion (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
   {
     obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
 				obstack_alignment, obstack_chunk_alloc,
 				obstack_chunk_free);
   }
-  ~ldp_bb_info ()
+  ~pair_fusion ()
   {
     obstack_free (&m_obstack, nullptr);
 
@@ -177,10 +313,50 @@ struct ldp_bb_info
 	bitmap_obstack_release (&m_bitmap_obstack);
       }
   }
+  void track_access (insn_info *, bool load, rtx mem);
+  void transform ();
+  void cleanup_tombstones ();
+  virtual void set_multiword_subreg (insn_info *i1, insn_info *i2,
+				     bool load_p) = 0;
+  virtual rtx gen_load_store_pair (rtx *pats, rtx writeback,
+				   bool load_p) = 0;
+  void merge_pairs (insn_list_t &, insn_list_t &,
+		    bool load_p, unsigned access_size);
+  virtual void transform_for_base (int load_size, access_group &group) = 0;
+
+  bool try_fuse_pair (bool load_p, unsigned access_size,
+			     insn_info *i1, insn_info *i2);
+
+  bool fuse_pair (bool load_p, unsigned access_size,
+		  int writeback,
+		  insn_info *i1, insn_info *i2,
+		  base_cand &base,
+		  const insn_range_info &move_range);
+
+  void do_alias_analysis (insn_info *alias_hazards[4],
+			  alias_walker *walkers[4],
+			  bool load_p);
+
+  void track_tombstone (int uid);
+
+  bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
 
-  inline void track_access (insn_info *, bool load, rtx mem);
-  inline void transform ();
-  inline void cleanup_tombstones ();
+  virtual bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode,
+			       bool load_p) = 0;
+
+  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
+  virtual bool pair_trailing_writeback_p () = 0;
+  virtual bool pair_check_register_operand (bool load_p, rtx reg_op,
+					    machine_mode mem_mode) = 0;
+  virtual int pair_mem_alias_check_limit () = 0;
+  virtual bool pair_is_writeback () = 0;
+  virtual bool pair_mem_ok_policy (rtx first_mem, bool load_p,
+				   machine_mode mode) = 0;
+  virtual bool fuseable_store_p (insn_info *i1, insn_info *i2) = 0;
+  virtual bool fuseable_load_p (insn_info *info) = 0;
+
+  template<typename Map>
+    void traverse_base_map (Map &map);
 
 private:
   obstack m_obstack;
@@ -191,100 +367,292 @@ private:
   bool m_emitted_tombstone;
 
   inline splay_tree_node<access_record *> *node_alloc (access_record *);
-
-  template<typename Map>
-  inline void traverse_base_map (Map &map);
-  inline void transform_for_base (int load_size, access_group &group);
-
-  inline void merge_pairs (insn_list_t &, insn_list_t &,
-			   bool load_p, unsigned access_size);
-
-  inline bool try_fuse_pair (bool load_p, unsigned access_size,
-			     insn_info *i1, insn_info *i2);
-
-  inline bool fuse_pair (bool load_p, unsigned access_size,
-			 int writeback,
-			 insn_info *i1, insn_info *i2,
-			 base_cand &base,
-			 const insn_range_info &move_range);
-
-  inline void track_tombstone (int uid);
-
-  inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
 };
-
-splay_tree_node<access_record *> *
-ldp_bb_info::node_alloc (access_record *access)
-{
-  using T = splay_tree_node<access_record *>;
-  void *addr = obstack_alloc (&m_obstack, sizeof (T));
-  return new (addr) T (access);
-}
-
-// Given a mem MEM, if the address has side effects, return a MEM that accesses
-// the same address but without the side effects.  Otherwise, return
-// MEM unchanged.
-static rtx
-drop_writeback (rtx mem)
+// Track the access INSN at offset OFFSET in this access group.
+// ALLOC_NODE is used to allocate splay tree nodes.
+template<typename Alloc>
+void
+access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
 {
-  rtx addr = XEXP (mem, 0);
+  auto insert_before = [&](std::list<access_record>::iterator after)
+    {
+      auto it = list.emplace (after, offset);
+      it->cand_insns.push_back (insn);
+      it->place = it;
+      return &*it;
+    };
 
-  if (!side_effects_p (addr))
-    return mem;
+  if (!list.size ())
+    {
+      auto access = insert_before (list.end ());
+      tree.insert_max_node (alloc_node (access));
+      return;
+    }
 
-  switch (GET_CODE (addr))
+  auto compare = [&](splay_tree_node<access_record *> *node)
     {
-    case PRE_MODIFY:
-      addr = XEXP (addr, 1);
-      break;
-    case POST_MODIFY:
-    case POST_INC:
-    case POST_DEC:
-      addr = XEXP (addr, 0);
-      break;
-    case PRE_INC:
-    case PRE_DEC:
+      return compare_sizes_for_sort (offset, node->value ()->offset);
+    };
+  auto result = tree.lookup (compare);
+  splay_tree_node<access_record *> *node = tree.root ();
+  if (result == 0)
+    node->value ()->cand_insns.push_back (insn);
+  else
     {
-      poly_int64 adjustment = GET_MODE_SIZE (GET_MODE (mem));
-      if (GET_CODE (addr) == PRE_DEC)
-	adjustment *= -1;
-      addr = plus_constant (GET_MODE (addr), XEXP (addr, 0), adjustment);
-      break;
-    }
-    default:
-      gcc_unreachable ();
+      auto it = node->value ()->place;
+      auto after = (result > 0) ? std::next (it) : it;
+      auto access = insert_before (after);
+      tree.insert_child (node, result > 0, alloc_node (access));
     }
-
-  return change_address (mem, GET_MODE (mem), addr);
 }
 
-// Convenience wrapper around strip_offset that can also look through
-// RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
-// MEM so that we know the mode of the access.
-static rtx
-ldp_strip_offset (rtx mem, poly_int64 *offset)
+bool
+store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget);
+bool load_modified_by_store_p (insn_info *load,
+			  insn_info *store,
+			  int &budget);
+
+// Implement some common functionality used by both store_walker
+// and load_walker.
+template<bool reverse>
+class def_walker : public alias_walker
 {
-  rtx addr = XEXP (mem, 0);
+protected:
+  using def_iter_t = typename std::conditional<reverse,
+	reverse_def_iterator, def_iterator>::type;
 
-  switch (GET_CODE (addr))
-    {
-    case PRE_MODIFY:
-    case POST_MODIFY:
-      addr = strip_offset (XEXP (addr, 1), offset);
-      gcc_checking_assert (REG_P (addr));
-      gcc_checking_assert (rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr));
-      break;
-    case PRE_INC:
-    case POST_INC:
-      addr = XEXP (addr, 0);
-      *offset = GET_MODE_SIZE (GET_MODE (mem));
-      gcc_checking_assert (REG_P (addr));
-      break;
-    case PRE_DEC:
-    case POST_DEC:
-      addr = XEXP (addr, 0);
-      *offset = -GET_MODE_SIZE (GET_MODE (mem));
-      gcc_checking_assert (REG_P (addr));
+  static use_info *start_use_chain (def_iter_t &def_iter)
+  {
+    set_info *set = nullptr;
+    for (; *def_iter; def_iter++)
+      {
+	set = dyn_cast<set_info *> (*def_iter);
+	if (!set)
+	  continue;
+
+	use_info *use = reverse
+	  ? set->last_nondebug_insn_use ()
+	  : set->first_nondebug_insn_use ();
+
+	if (use)
+	  return use;
+      }
+
+    return nullptr;
+  }
+
+  def_iter_t def_iter;
+  insn_info *limit;
+  def_walker (def_info *def, insn_info *limit) :
+    def_iter (def), limit (limit) {}
+
+  virtual bool iter_valid () const { return *def_iter; }
+
+public:
+  insn_info *insn () const override { return (*def_iter)->insn (); }
+  void advance () override { def_iter++; }
+  bool valid () const override final
+  {
+    if (!iter_valid ())
+      return false;
+
+    if (reverse)
+      return *(insn ()) > *limit;
+    else
+      return *(insn ()) < *limit;
+  }
+};
+
+// alias_walker that iterates over stores.
+template<bool reverse, typename InsnPredicate>
+class store_walker : public def_walker<reverse>
+{
+  rtx cand_mem;
+  InsnPredicate tombstone_p;
+
+public:
+  store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn,
+		InsnPredicate tombstone_fn) :
+    def_walker<reverse> (mem_def, limit_insn),
+    cand_mem (mem), tombstone_p (tombstone_fn) {}
+  bool conflict_p (int &budget) const override final
+  {
+    if (tombstone_p (this->insn ()))
+      return false;
+
+    return store_modifies_mem_p (cand_mem, this->insn (), budget);
+  }
+};
+
+// alias_walker that iterates over loads.
+template<bool reverse>
+class load_walker : public def_walker<reverse>
+{
+  using Base = def_walker<reverse>;
+  using use_iter_t = typename std::conditional<reverse,
+	reverse_use_iterator, nondebug_insn_use_iterator>::type;
+
+  use_iter_t use_iter;
+  insn_info *cand_store;
+
+  bool iter_valid () const override final { return *use_iter; }
+
+public:
+  void advance () override final
+  {
+    use_iter++;
+    if (*use_iter)
+      return;
+    this->def_iter++;
+    use_iter = Base::start_use_chain (this->def_iter);
+  }
+
+  insn_info *insn () const override final
+  {
+    return (*use_iter)->insn ();
+  }
+  bool conflict_p (int &budget) const override final
+  {
+    return load_modified_by_store_p (insn (), cand_store, budget);
+  }
+  load_walker (def_info *def, insn_info *store, insn_info *limit_insn)
+    : Base (def, limit_insn),
+      use_iter (Base::start_use_chain (this->def_iter)),
+      cand_store (store) {}
+};
+
+extern insn_info *
+try_repurpose_store (insn_info *first,
+		     insn_info *second,
+		     const insn_range_info &move_range);
+
+void reset_debug_use (use_info *use);
+
+extern void
+fixup_debug_uses (obstack_watermark &attempt,
+		  insn_info *insns[2],
+		  rtx orig_rtl[2],
+		  insn_info *pair_dst,
+		  insn_info *trailing_add,
+		  bool load_p,
+		  int writeback,
+		  rtx writeback_effect,
+		  unsigned base_regno);
+
+void
+fixup_debug_uses_trailing_add (obstack_watermark &attempt,
+			       insn_info *pair_dst,
+			       insn_info *trailing_add,
+			       rtx writeback_effect);
+
+
+extern void
+fixup_debug_use (obstack_watermark &attempt,
+		 use_info *use,
+		 def_info *def,
+		 rtx base,
+		 poly_int64 wb_offset);
+
+extern insn_info *
+find_trailing_add (insn_info *insns[2],
+		   const insn_range_info &pair_range,
+		   int initial_writeback,
+		   rtx *writeback_effect,
+		   def_info **add_def,
+		   def_info *base_def,
+		   poly_int64 initial_offset,
+		   unsigned access_size);
+
+rtx drop_writeback (rtx mem);
+rtx pair_mem_strip_offset (rtx mem, poly_int64 *offset);
+bool any_pre_modify_p (rtx x);
+bool any_post_modify_p (rtx x);
+int encode_lfs (lfs_fields fields);
+extern insn_info * latest_hazard_before (insn_info *insn, rtx *ignore,
+		      insn_info *ignore_insn = nullptr);
+insn_info * first_hazard_after (insn_info *insn, rtx *ignore);
+bool ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2);
+insn_range_info get_def_range (def_info *def);
+insn_range_info def_downwards_move_range (def_info *def);
+insn_range_info def_upwards_move_range (def_info *def);
+rtx gen_tombstone (void);
+rtx filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr);
+rtx combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p);
+rtx extract_writebacks (bool load_p, rtx pats[2], int changed);
+void do_alias_analysis (insn_info *alias_hazards[4],
+		   alias_walker *walkers[4],
+		   bool load_p);
+int get_viable_bases (insn_info *insns[2],
+		  vec<base_cand> &base_cands,
+		  rtx cand_mems[2],
+		  unsigned access_size,
+		  bool reversed);
+void dump_insn_list (FILE *f, const insn_list_t &l);
+
+// Given a mem MEM, if the address has side effects, return a MEM that accesses
+// the same address but without the side effects.  Otherwise, return
+// MEM unchanged.
+rtx
+drop_writeback (rtx mem)
+{
+  rtx addr = XEXP (mem, 0);
+
+  if (!side_effects_p (addr))
+    return mem;
+
+  switch (GET_CODE (addr))
+    {
+    case PRE_MODIFY:
+      addr = XEXP (addr, 1);
+      break;
+    case POST_MODIFY:
+    case POST_INC:
+    case POST_DEC:
+      addr = XEXP (addr, 0);
+      break;
+    case PRE_INC:
+    case PRE_DEC:
+    {
+      poly_int64 adjustment = GET_MODE_SIZE (GET_MODE (mem));
+      if (GET_CODE (addr) == PRE_DEC)
+	adjustment *= -1;
+      addr = plus_constant (GET_MODE (addr), XEXP (addr, 0), adjustment);
+      break;
+    }
+    default:
+      gcc_unreachable ();
+    }
+
+  return change_address (mem, GET_MODE (mem), addr);
+}
+
+// Convenience wrapper around strip_offset that can also look through
+// RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
+// MEM so that we know the mode of the access.
+rtx
+pair_mem_strip_offset (rtx mem, poly_int64 *offset)
+{
+  rtx addr = XEXP (mem, 0);
+
+  switch (GET_CODE (addr))
+    {
+    case PRE_MODIFY:
+    case POST_MODIFY:
+      addr = strip_offset (XEXP (addr, 1), offset);
+      gcc_checking_assert (REG_P (addr));
+      gcc_checking_assert (rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr));
+      break;
+    case PRE_INC:
+    case POST_INC:
+      addr = XEXP (addr, 0);
+      *offset = GET_MODE_SIZE (GET_MODE (mem));
+      gcc_checking_assert (REG_P (addr));
+      break;
+    case PRE_DEC:
+    case POST_DEC:
+      addr = XEXP (addr, 0);
+      *offset = -GET_MODE_SIZE (GET_MODE (mem));
+      gcc_checking_assert (REG_P (addr));
       break;
 
     default:
@@ -295,7 +663,7 @@ ldp_strip_offset (rtx mem, poly_int64 *offset)
 }
 
 // Return true if X is a PRE_{INC,DEC,MODIFY} rtx.
-static bool
+bool
 any_pre_modify_p (rtx x)
 {
   const auto code = GET_CODE (x);
@@ -303,318 +671,42 @@ any_pre_modify_p (rtx x)
 }
 
 // Return true if X is a POST_{INC,DEC,MODIFY} rtx.
-static bool
+bool
 any_post_modify_p (rtx x)
 {
   const auto code = GET_CODE (x);
   return code == POST_INC || code == POST_DEC || code == POST_MODIFY;
 }
 
-// Return true if we should consider forming ldp/stp insns from memory
-// accesses with operand mode MODE at this stage in compilation.
-static bool
-ldp_operand_mode_ok_p (machine_mode mode)
-{
-  const bool allow_qregs
-    = !(aarch64_tune_params.extra_tuning_flags
-	& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
-
-  if (!aarch64_ldpstp_operand_mode_p (mode))
-    return false;
-
-  const auto size = GET_MODE_SIZE (mode).to_constant ();
-  if (size == 16 && !allow_qregs)
-    return false;
-
-  // We don't pair up TImode accesses before RA because TImode is
-  // special in that it can be allocated to a pair of GPRs or a single
-  // FPR, and the RA is best placed to make that decision.
-  return reload_completed || mode != TImode;
-}
-
 // Given LFS (load_p, fpsimd_p, size) fields in FIELDS, encode these
 // into an integer for use as a hash table key.
-static int
+int
 encode_lfs (lfs_fields fields)
 {
   int size_log2 = exact_log2 (fields.size);
-  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
+  //gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
   return ((int)fields.load_p << 3)
     | ((int)fields.fpsimd_p << 2)
     | (size_log2 - 2);
 }
 
-// Inverse of encode_lfs.
-static lfs_fields
-decode_lfs (int lfs)
-{
-  bool load_p = (lfs & (1 << 3));
-  bool fpsimd_p = (lfs & (1 << 2));
-  unsigned size = 1U << ((lfs & 3) + 2);
-  return { load_p, fpsimd_p, size };
-}
+// Dummy predicate that never ignores any insns.
+static bool no_ignore (insn_info *) { return false; }
 
-// Track the access INSN at offset OFFSET in this access group.
-// ALLOC_NODE is used to allocate splay tree nodes.
-template<typename Alloc>
-void
-access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
-{
-  auto insert_before = [&](std::list<access_record>::iterator after)
-    {
-      auto it = list.emplace (after, offset);
-      it->cand_insns.push_back (insn);
-      it->place = it;
-      return &*it;
-    };
-
-  if (!list.size ())
-    {
-      auto access = insert_before (list.end ());
-      tree.insert_max_node (alloc_node (access));
-      return;
-    }
-
-  auto compare = [&](splay_tree_node<access_record *> *node)
-    {
-      return compare_sizes_for_sort (offset, node->value ()->offset);
-    };
-  auto result = tree.lookup (compare);
-  splay_tree_node<access_record *> *node = tree.root ();
-  if (result == 0)
-    node->value ()->cand_insns.push_back (insn);
-  else
-    {
-      auto it = node->value ()->place;
-      auto after = (result > 0) ? std::next (it) : it;
-      auto access = insert_before (after);
-      tree.insert_child (node, result > 0, alloc_node (access));
-    }
-}
-
-// Given a candidate access INSN (with mem MEM), see if it has a suitable
-// MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
-// LFS is used as part of the key to the hash table, see track_access.
-bool
-ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
-{
-  if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
-    return false;
-
-  poly_int64 offset;
-  tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
-						  &offset);
-  if (!base_expr || !DECL_P (base_expr))
-    return false;
-
-  offset += MEM_OFFSET (mem);
-
-  const machine_mode mem_mode = GET_MODE (mem);
-  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
-
-  // Punt on misaligned offsets.  LDP/STP instructions require offsets to be a
-  // multiple of the access size, and we believe that misaligned offsets on
-  // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
-  if (!multiple_p (offset, mem_size))
-    return false;
-
-  const auto key = std::make_pair (base_expr, encode_lfs (lfs));
-  access_group &group = expr_map.get_or_insert (key, NULL);
-  auto alloc = [&](access_record *access) { return node_alloc (access); };
-  group.track (alloc, offset, insn);
-
-  if (dump_file)
-    {
-      fprintf (dump_file, "[bb %u] tracking insn %d via ",
-	       m_bb->index (), insn->uid ());
-      print_node_brief (dump_file, "mem expr", base_expr, 0);
-      fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
-	       lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
-      print_dec (offset, dump_file);
-      fprintf (dump_file, "]\n");
-    }
-
-  return true;
-}
-
-// Main function to begin pair discovery.  Given a memory access INSN,
-// determine whether it could be a candidate for fusing into an ldp/stp,
-// and if so, track it in the appropriate data structure for this basic
-// block.  LOAD_P is true if the access is a load, and MEM is the mem
-// rtx that occurs in INSN.
-void
-ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
-{
-  // We can't combine volatile MEMs, so punt on these.
-  if (MEM_VOLATILE_P (mem))
-    return;
-
-  // Ignore writeback accesses if the param says to do so.
-  if (!aarch64_ldp_writeback
-      && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
-    return;
-
-  const machine_mode mem_mode = GET_MODE (mem);
-  if (!ldp_operand_mode_ok_p (mem_mode))
-    return;
-
-  rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);
-
-  // Ignore the access if the register operand isn't suitable for ldp/stp.
-  if (load_p
-      ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
-      : !aarch64_stp_reg_operand (reg_op, mem_mode))
-    return;
-
-  // We want to segregate FP/SIMD accesses from GPR accesses.
-  //
-  // Before RA, we use the modes, noting that stores of constant zero
-  // operands use GPRs (even in non-integer modes).  After RA, we use
-  // the hard register numbers.
-  const bool fpsimd_op_p
-    = reload_completed
-    ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
-    : (GET_MODE_CLASS (mem_mode) != MODE_INT
-       && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
-
-  // Note ldp_operand_mode_ok_p already rejected VL modes.
-  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
-  const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };
-
-  if (track_via_mem_expr (insn, mem, lfs))
-    return;
-
-  poly_int64 mem_off;
-  rtx addr = XEXP (mem, 0);
-  const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
-  rtx base = ldp_strip_offset (mem, &mem_off);
-  if (!REG_P (base))
-    return;
-
-  // Need to calculate two (possibly different) offsets:
-  //  - Offset at which the access occurs.
-  //  - Offset of the new base def.
-  poly_int64 access_off;
-  if (autoinc_p && any_post_modify_p (addr))
-    access_off = 0;
-  else
-    access_off = mem_off;
-
-  poly_int64 new_def_off = mem_off;
-
-  // Punt on accesses relative to eliminable regs.  Since we don't know the
-  // elimination offset pre-RA, we should postpone forming pairs on such
-  // accesses until after RA.
-  //
-  // As it stands, addresses with offsets in range for LDR but not
-  // in range for LDP/STP are currently reloaded inefficiently,
-  // ending up with a separate base register for each pair.
-  //
-  // In theory LRA should make use of
-  // targetm.legitimize_address_displacement to promote sharing of
-  // bases among multiple (nearby) address reloads, but the current
-  // LRA code returns early from process_address_1 for operands that
-  // satisfy "m", even if they don't satisfy the real (relaxed) address
-  // constraint; this early return means we never get to the code
-  // that calls targetm.legitimize_address_displacement.
-  //
-  // So for now, it's better to punt when we can't be sure that the
-  // offset is in range for LDP/STP.  Out-of-range cases can then be
-  // handled after RA by the out-of-range LDP/STP peepholes.  Eventually, it
-  // would be nice to handle known out-of-range opportunities in the
-  // pass itself (for stack accesses, this would be in the post-RA pass).
-  if (!reload_completed
-      && (REGNO (base) == FRAME_POINTER_REGNUM
-	  || REGNO (base) == ARG_POINTER_REGNUM))
-    return;
-
-  // Now need to find def of base register.
-  use_info *base_use = find_access (insn->uses (), REGNO (base));
-  gcc_assert (base_use);
-  def_info *base_def = base_use->def ();
-  if (!base_def)
-    {
-      if (dump_file)
-	fprintf (dump_file,
-		 "base register (regno %d) of insn %d is undefined",
-		 REGNO (base), insn->uid ());
-      return;
-    }
-
-  alt_base *canon_base = canon_base_map.get (base_def);
-  if (canon_base)
-    {
-      // Express this as the combined offset from the canonical base.
-      base_def = canon_base->base;
-      new_def_off += canon_base->offset;
-      access_off += canon_base->offset;
-    }
-
-  if (autoinc_p)
-    {
-      auto def = find_access (insn->defs (), REGNO (base));
-      gcc_assert (def);
-
-      // Record that DEF = BASE_DEF + MEM_OFF.
-      if (dump_file)
-	{
-	  pretty_printer pp;
-	  pp_access (&pp, def, 0);
-	  pp_string (&pp, " = ");
-	  pp_access (&pp, base_def, 0);
-	  fprintf (dump_file, "[bb %u] recording %s + ",
-		   m_bb->index (), pp_formatted_text (&pp));
-	  print_dec (new_def_off, dump_file);
-	  fprintf (dump_file, "\n");
-	}
-
-      alt_base base_rec { base_def, new_def_off };
-      if (canon_base_map.put (def, base_rec))
-	gcc_unreachable (); // Base defs should be unique.
-    }
-
-  // Punt on misaligned offsets.  LDP/STP require offsets to be a multiple of
-  // the access size.
-  if (!multiple_p (mem_off, mem_size))
-    return;
-
-  const auto key = std::make_pair (base_def, encode_lfs (lfs));
-  access_group &group = def_map.get_or_insert (key, NULL);
-  auto alloc = [&](access_record *access) { return node_alloc (access); };
-  group.track (alloc, access_off, insn);
-
-  if (dump_file)
-    {
-      pretty_printer pp;
-      pp_access (&pp, base_def, 0);
-
-      fprintf (dump_file, "[bb %u] tracking insn %d via %s",
-	       m_bb->index (), insn->uid (), pp_formatted_text (&pp));
-      fprintf (dump_file,
-	       " [L=%d, WB=%d, FP=%d, %smode, off=",
-	       lfs.load_p, autoinc_p, lfs.fpsimd_p, mode_name[mem_mode]);
-      print_dec (access_off, dump_file);
-      fprintf (dump_file, "]\n");
-    }
-}
-
-// Dummy predicate that never ignores any insns.
-static bool no_ignore (insn_info *) { return false; }
-
-// Return the latest dataflow hazard before INSN.
-//
-// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore for
-// dataflow purposes.  This is needed when considering changing the RTL base of
-// an access discovered through a MEM_EXPR base.
-//
-// If IGNORE_INSN is non-NULL, we should further ignore any hazards arising
-// from that insn.
-//
-// N.B. we ignore any defs/uses of memory here as we deal with that separately,
-// making use of alias disambiguation.
-static insn_info *
-latest_hazard_before (insn_info *insn, rtx *ignore,
-		      insn_info *ignore_insn = nullptr)
+// Return the latest dataflow hazard before INSN.
+//
+// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore for
+// dataflow purposes.  This is needed when considering changing the RTL base of
+// an access discovered through a MEM_EXPR base.
+//
+// If IGNORE_INSN is non-NULL, we should further ignore any hazards arising
+// from that insn.
+//
+// N.B. we ignore any defs/uses of memory here as we deal with that separately,
+// making use of alias disambiguation.
+insn_info *
+latest_hazard_before (insn_info *insn, rtx *ignore,
+		      insn_info *ignore_insn)// = nullptr)
 {
   insn_info *result = nullptr;
 
@@ -698,7 +790,7 @@ latest_hazard_before (insn_info *insn, rtx *ignore,
 //
 // N.B. we ignore any defs/uses of memory here as we deal with that separately,
 // making use of alias disambiguation.
-static insn_info *
+insn_info *
 first_hazard_after (insn_info *insn, rtx *ignore)
 {
   insn_info *result = nullptr;
@@ -787,7 +879,7 @@ first_hazard_after (insn_info *insn, rtx *ignore)
 }
 
 // Return true iff R1 and R2 overlap.
-static bool
+bool
 ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
 {
   // If either range is empty, then their intersection is empty.
@@ -799,9 +891,8 @@ ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
   // Inverting this, we get the below.
   return *r1.last >= *r2.first && *r2.last >= *r1.first;
 }
-
 // Get the range of insns that def feeds.
-static insn_range_info get_def_range (def_info *def)
+insn_range_info get_def_range (def_info *def)
 {
   insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
   return { def->insn (), last };
@@ -809,7 +900,7 @@ static insn_range_info get_def_range (def_info *def)
 
 // Given a def (of memory), return the downwards range within which we
 // can safely move this def.
-static insn_range_info
+insn_range_info
 def_downwards_move_range (def_info *def)
 {
   auto range = get_def_range (def);
@@ -827,7 +918,7 @@ def_downwards_move_range (def_info *def)
 
 // Given a def (of memory), return the upwards range within which we can
 // safely move this def.
-static insn_range_info
+insn_range_info
 def_upwards_move_range (def_info *def)
 {
   def_info *prev = def->prev_def ();
@@ -844,189 +935,18 @@ def_upwards_move_range (def_info *def)
   return range;
 }
 
-// Class that implements a state machine for building the changes needed to form
-// a store pair instruction.  This allows us to easily build the changes in
-// program order, as required by rtl-ssa.
-struct stp_change_builder
+// Generate the RTL pattern for a "tombstone"; used temporarily during this pass
+// to replace stores that are marked for deletion where we can't immediately
+// delete the store (since there are uses of mem hanging off the store).
+//
+// These are deleted at the end of the pass and uses re-parented appropriately
+// at this point.
+rtx
+gen_tombstone (void)
 {
-  enum class state
-  {
-    FIRST,
-    INSERT,
-    FIXUP_USE,
-    LAST,
-    DONE
-  };
-
-  enum class action
-  {
-    TOMBSTONE,
-    CHANGE,
-    INSERT,
-    FIXUP_USE
-  };
-
-  struct change
-  {
-    action type;
-    insn_info *insn;
-  };
-
-  bool done () const { return m_state == state::DONE; }
-
-  stp_change_builder (insn_info *insns[2],
-		      insn_info *repurpose,
-		      insn_info *dest)
-    : m_state (state::FIRST), m_insns { insns[0], insns[1] },
-      m_repurpose (repurpose), m_dest (dest), m_use (nullptr) {}
-
-  change get_change () const
-  {
-    switch (m_state)
-      {
-      case state::FIRST:
-	return {
-	  m_insns[0] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
-	  m_insns[0]
-	};
-      case state::LAST:
-	return {
-	  m_insns[1] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
-	  m_insns[1]
-	};
-      case state::INSERT:
-	return { action::INSERT, m_dest };
-      case state::FIXUP_USE:
-	return { action::FIXUP_USE, m_use->insn () };
-      case state::DONE:
-	break;
-      }
-
-    gcc_unreachable ();
-  }
-
-  // Transition to the next state.
-  void advance ()
-  {
-    switch (m_state)
-      {
-      case state::FIRST:
-	if (m_repurpose)
-	  m_state = state::LAST;
-	else
-	  m_state = state::INSERT;
-	break;
-      case state::INSERT:
-      {
-	def_info *def = memory_access (m_insns[0]->defs ());
-	while (*def->next_def ()->insn () <= *m_dest)
-	  def = def->next_def ();
-
-	// Now we know DEF feeds the insertion point for the new stp.
-	// Look for any uses of DEF that will consume the new stp.
-	gcc_assert (*def->insn () <= *m_dest
-		    && *def->next_def ()->insn () > *m_dest);
-
-	auto set = as_a<set_info *> (def);
-	for (auto use : set->nondebug_insn_uses ())
-	  if (*use->insn () > *m_dest)
-	    {
-	      m_use = use;
-	      break;
-	    }
-
-	if (m_use)
-	  m_state = state::FIXUP_USE;
-	else
-	  m_state = state::LAST;
-	break;
-      }
-      case state::FIXUP_USE:
-	m_use = m_use->next_nondebug_insn_use ();
-	if (!m_use)
-	  m_state = state::LAST;
-	break;
-      case state::LAST:
-	m_state = state::DONE;
-	break;
-      case state::DONE:
-	gcc_unreachable ();
-      }
-  }
-
-private:
-  state m_state;
-
-  // Original candidate stores.
-  insn_info *m_insns[2];
-
-  // If non-null, this is a candidate insn to change into an stp.  Otherwise we
-  // are deleting both original insns and inserting a new insn for the stp.
-  insn_info *m_repurpose;
-
-  // Destionation of the stp, it will be placed immediately after m_dest.
-  insn_info *m_dest;
-
-  // Current nondebug use that needs updating due to stp insertion.
-  use_info *m_use;
-};
-
-// Given candidate store insns FIRST and SECOND, see if we can re-purpose one
-// of them (together with its def of memory) for the stp insn.  If so, return
-// that insn.  Otherwise, return null.
-static insn_info *
-try_repurpose_store (insn_info *first,
-		     insn_info *second,
-		     const insn_range_info &move_range)
-{
-  def_info * const defs[2] = {
-    memory_access (first->defs ()),
-    memory_access (second->defs ())
-  };
-
-  if (move_range.includes (first)
-      || ranges_overlap_p (move_range, def_downwards_move_range (defs[0])))
-    return first;
-
-  if (move_range.includes (second)
-      || ranges_overlap_p (move_range, def_upwards_move_range (defs[1])))
-    return second;
-
-  return nullptr;
-}
-
-// Generate the RTL pattern for a "tombstone"; used temporarily during this pass
-// to replace stores that are marked for deletion where we can't immediately
-// delete the store (since there are uses of mem hanging off the store).
-//
-// These are deleted at the end of the pass and uses re-parented appropriately
-// at this point.
-static rtx
-gen_tombstone (void)
-{
-  return gen_rtx_CLOBBER (VOIDmode,
-			  gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)));
-}
-
-// Given a pair mode MODE, return a canonical mode to be used for a single
-// operand of such a pair.  Currently we only use this when promoting a
-// non-writeback pair into a writeback pair, as it isn't otherwise clear
-// which mode to use when storing a modeless CONST_INT.
-static machine_mode
-aarch64_operand_mode_for_pair_mode (machine_mode mode)
-{
-  switch (mode)
-    {
-    case E_V2x4QImode:
-      return SImode;
-    case E_V2x8QImode:
-      return DImode;
-    case E_V2x16QImode:
-      return V16QImode;
-    default:
-      gcc_unreachable ();
-    }
-}
+  return gen_rtx_CLOBBER (VOIDmode,
+			  gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)));
+}
 
 // Go through the reg notes rooted at NOTE, dropping those that we should drop,
 // and preserving those that we want to keep by prepending them to (and
@@ -1034,7 +954,7 @@ aarch64_operand_mode_for_pair_mode (machine_mode mode)
 // REG_EH_REGION note in the resulting list.  FR_EXPR is used to return any
 // REG_FRAME_RELATED_EXPR note we find, as these can need special handling in
 // combine_reg_notes.
-static rtx
+rtx
 filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
 {
   for (; note; note = XEXP (note, 1))
@@ -1084,7 +1004,7 @@ filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
 
 // Return the notes that should be attached to a combination of I1 and I2, where
 // *I1 < *I2.  LOAD_P is true for loads.
-static rtx
+rtx
 combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
 {
   // Temporary storage for REG_FRAME_RELATED_EXPR notes.
@@ -1100,8 +1020,8 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
   if (!load_p)
     {
       // Simple frame-related sp-relative saves don't need CFI notes, but when
-      // we combine them into an stp we will need a CFI note as dwarf2cfi can't
-      // interpret the unspec pair representation directly.
+      // we combine them into a pair mem store, we will need a CFI note as
+      // dwarf2cfi can't interpret the unspec pair representation directly.
       if (RTX_FRAME_RELATED_P (i1->rtl ()) && !fr_expr[0])
 	fr_expr[0] = copy_rtx (PATTERN (i1->rtl ()));
       if (RTX_FRAME_RELATED_P (i2->rtl ()) && !fr_expr[1])
@@ -1133,7 +1053,7 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
 // relative to the initial value of the base register, and output these
 // in PATS.  Return an rtx that represents the overall change to the
 // base register.
-static rtx
+rtx
 extract_writebacks (bool load_p, rtx pats[2], int changed)
 {
   rtx base_reg = NULL_RTX;
@@ -1150,7 +1070,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
       const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
 
       poly_int64 offset;
-      rtx this_base = ldp_strip_offset (mem, &offset);
+      rtx this_base = pair_mem_strip_offset (mem, &offset);
       gcc_assert (REG_P (this_base));
       if (base_reg)
 	gcc_assert (rtx_equal_p (base_reg, this_base));
@@ -1207,7 +1127,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
 // base register.  If there is one, we choose the first such update after
 // PAIR_DST that is still in the same BB as our pair.  We return the new def in
 // *ADD_DEF and the resulting writeback effect in *WRITEBACK_EFFECT.
-static insn_info *
+insn_info *
 find_trailing_add (insn_info *insns[2],
 		   const insn_range_info &pair_range,
 		   int initial_writeback,
@@ -1286,7 +1206,7 @@ find_trailing_add (insn_info *insns[2],
 
   off_hwi /= access_size;
 
-  if (off_hwi < LDP_MIN_IMM || off_hwi > LDP_MAX_IMM)
+  if (off_hwi < PAIR_MEM_MIN_IMM || off_hwi > PAIR_MEM_MAX_IMM)
     return nullptr;
 
   auto dump_prefix = [&]()
@@ -1325,26 +1245,93 @@ find_trailing_add (insn_info *insns[2],
   return nullptr;
 }
 
-// We just emitted a tombstone with uid UID, track it in a bitmap for
-// this BB so we can easily identify it later when cleaning up tombstones.
-void
-ldp_bb_info::track_tombstone (int uid)
+// Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
+// within our BUDGET for alias analysis.
+bool
+store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
 {
-  if (!m_emitted_tombstone)
+  if (!budget)
     {
-      // Lazily initialize the bitmap for tracking tombstone insns.
-      bitmap_obstack_initialize (&m_bitmap_obstack);
-      bitmap_initialize (&m_tombstone_bitmap, &m_bitmap_obstack);
-      m_emitted_tombstone = true;
+      if (dump_file)
+	{
+	  fprintf (dump_file,
+		   "exceeded budget, assuming store %d aliases with mem ",
+		   store_insn->uid ());
+	  print_simple_rtl (dump_file, mem);
+	  fprintf (dump_file, "\n");
+	}
+
+      return true;
     }
 
-  if (!bitmap_set_bit (&m_tombstone_bitmap, uid))
-    gcc_unreachable (); // Bit should have changed.
+  budget--;
+  return memory_modified_in_insn_p (mem, store_insn->rtl ());
+}
+
+// Return true if LOAD may be modified by STORE.  Make sure we keep
+// within our BUDGET for alias analysis.
+bool
+load_modified_by_store_p (insn_info *load,
+			  insn_info *store,
+			  int &budget)
+{
+  gcc_checking_assert (budget >= 0);
+
+  if (!budget)
+    {
+      if (dump_file)
+	{
+	  fprintf (dump_file,
+		   "exceeded budget, assuming load %d aliases with store %d\n",
+		   load->uid (), store->uid ());
+	}
+      return true;
+    }
+
+  // It isn't safe to re-order stores over calls.
+  if (CALL_P (load->rtl ()))
+    return true;
+
+  budget--;
+
+  // Iterate over all MEMs in the load, seeing if any alias with
+  // our store.
+  subrtx_var_iterator::array_type array;
+  rtx pat = PATTERN (load->rtl ());
+  FOR_EACH_SUBRTX_VAR (iter, array, pat, NONCONST)
+    if (MEM_P (*iter) && memory_modified_in_insn_p (*iter, store->rtl ()))
+      return true;
+
+  return false;
+}
+// Given candidate store insns FIRST and SECOND, see if we can re-purpose one
+// of them (together with its def of memory) for the stp insn.  If so, return
+// that insn.  Otherwise, return null.
+insn_info *
+try_repurpose_store (insn_info *first,
+		     insn_info *second,
+		     const insn_range_info &move_range)
+{
+  def_info * const defs[2] = {
+    memory_access (first->defs ()),
+    memory_access (second->defs ())
+  };
+
+  if (move_range.includes (first)
+      || ranges_overlap_p (move_range, def_downwards_move_range (defs[0])))
+    return first;
+
+  if (move_range.includes (second)
+      || ranges_overlap_p (move_range, def_upwards_move_range (defs[1])))
+    return second;
+
+  return nullptr;
 }
 
+
 // Reset the debug insn containing USE (the debug insn has been
 // optimized away).
-static void
+void
 reset_debug_use (use_info *use)
 {
   auto use_insn = use->insn ();
@@ -1355,12 +1342,43 @@ reset_debug_use (use_info *use)
   crtl->ssa->change_insn (change);
 }
 
+// Update debug uses when folding in a trailing add insn to form a
+// writeback pair.
+//
+// ATTEMPT is used to allocate RTL-SSA temporaries for the changes,
+// the final pair is placed immediately after PAIR_DST, TRAILING_ADD
+// is a trailing add insn which is being folded into the pair to make it
+// use writeback addressing, and WRITEBACK_EFFECT is the pattern for
+// TRAILING_ADD.
+void
+fixup_debug_uses_trailing_add (obstack_watermark &attempt,
+			       insn_info *pair_dst,
+			       insn_info *trailing_add,
+			       rtx writeback_effect)
+{
+  rtx base = SET_DEST (writeback_effect);
+
+  poly_int64 wb_offset;
+  rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset);
+  gcc_checking_assert (rtx_equal_p (base, base2));
+
+  auto defs = trailing_add->defs ();
+  gcc_checking_assert (defs.size () == 1);
+  def_info *def = defs[0];
+
+  if (auto set = safe_dyn_cast<set_info *> (def->prev_def ()))
+    for (auto use : iterate_safely (set->debug_insn_uses ()))
+      if (*use->insn () > *pair_dst)
+	// DEF is getting re-ordered above USE, fix up USE accordingly.
+	fixup_debug_use (attempt, use, def, base, wb_offset);
+}
+
 // USE is a debug use that needs updating because DEF (a def of the same
 // register) is being re-ordered over it.  If BASE is non-null, then DEF
 // is an update of the register BASE by a constant, given by WB_OFFSET,
 // and we can preserve debug info by accounting for the change in side
 // effects.
-static void
+void
 fixup_debug_use (obstack_watermark &attempt,
 		 use_info *use,
 		 def_info *def,
@@ -1455,37 +1473,6 @@ fixup_debug_use (obstack_watermark &attempt,
     }
 }
 
-// Update debug uses when folding in a trailing add insn to form a
-// writeback pair.
-//
-// ATTEMPT is used to allocate RTL-SSA temporaries for the changes,
-// the final pair is placed immediately after PAIR_DST, TRAILING_ADD
-// is a trailing add insn which is being folded into the pair to make it
-// use writeback addressing, and WRITEBACK_EFFECT is the pattern for
-// TRAILING_ADD.
-static void
-fixup_debug_uses_trailing_add (obstack_watermark &attempt,
-			       insn_info *pair_dst,
-			       insn_info *trailing_add,
-			       rtx writeback_effect)
-{
-  rtx base = SET_DEST (writeback_effect);
-
-  poly_int64 wb_offset;
-  rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset);
-  gcc_checking_assert (rtx_equal_p (base, base2));
-
-  auto defs = trailing_add->defs ();
-  gcc_checking_assert (defs.size () == 1);
-  def_info *def = defs[0];
-
-  if (auto set = safe_dyn_cast<set_info *> (def->prev_def ()))
-    for (auto use : iterate_safely (set->debug_insn_uses ()))
-      if (*use->insn () > *pair_dst)
-	// DEF is getting re-ordered above USE, fix up USE accordingly.
-	fixup_debug_use (attempt, use, def, base, wb_offset);
-}
-
 // Called from fuse_pair, fixes up any debug uses that will be affected
 // by the changes.
 //
@@ -1500,7 +1487,7 @@ fixup_debug_uses_trailing_add (obstack_watermark &attempt,
 // writeback, and WRITEBACK_EFFECT is an rtx describing the overall update to
 // the base register in the final pair (if any).  BASE_REGNO gives the register
 // number of the base register used in the final pair.
-static void
+void
 fixup_debug_uses (obstack_watermark &attempt,
 		  insn_info *insns[2],
 		  rtx orig_rtl[2],
@@ -1528,7 +1515,7 @@ fixup_debug_uses (obstack_watermark &attempt,
 	  gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0)))
 			       == RTX_AUTOINC);
 
-	  base = ldp_strip_offset (mem, &offset);
+	  base = pair_mem_strip_offset (mem, &offset);
 	  gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno);
 	}
       fixup_debug_use (attempt, use, def, base, offset);
@@ -1651,621 +1638,846 @@ fixup_debug_uses (obstack_watermark &attempt,
 				   writeback_effect);
 }
 
-// Try and actually fuse the pair given by insns I1 and I2.
-//
-// Here we've done enough analysis to know this is safe, we only
-// reject the pair at this stage if either the tuning policy says to,
-// or recog fails on the final pair insn.
-//
-// LOAD_P is true for loads, ACCESS_SIZE gives the access size of each
-// candidate insn.  Bit i of WRITEBACK is set if the ith insn (in program
-// order) uses writeback.
+// Given INSNS (in program order) which are known to be adjacent, look
+// to see if either insn has a suitable RTL (register) base that we can
+// use to form a pair.  Push these to BASE_CANDS if we find any.  CAND_MEMs
+// gives the relevant mems from the candidate insns, ACCESS_SIZE gives the
+// size of a single candidate access, and REVERSED says whether the accesses
+// are inverted in offset order.
 //
-// BASE gives the chosen base candidate for the pair and MOVE_RANGE is
-// a singleton range which says where to place the pair.
-bool
-ldp_bb_info::fuse_pair (bool load_p,
-			unsigned access_size,
-			int writeback,
-			insn_info *i1, insn_info *i2,
-			base_cand &base,
-			const insn_range_info &move_range)
+// Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
+// addressing.
+int
+get_viable_bases (insn_info *insns[2],
+		  vec<base_cand> &base_cands,
+		  rtx cand_mems[2],
+		  unsigned access_size,
+		  bool reversed)
 {
-  auto attempt = crtl->ssa->new_change_attempt ();
-
-  auto make_change = [&attempt](insn_info *insn)
-    {
-      return crtl->ssa->change_alloc<insn_change> (attempt, insn);
-    };
-  auto make_delete = [&attempt](insn_info *insn)
-    {
-      return crtl->ssa->change_alloc<insn_change> (attempt,
-						   insn,
-						   insn_change::DELETE);
-    };
-
-  insn_info *first = (*i1 < *i2) ? i1 : i2;
-  insn_info *second = (first == i1) ? i2 : i1;
-
-  insn_info *pair_dst = move_range.singleton ();
-  gcc_assert (pair_dst);
-
-  insn_info *insns[2] = { first, second };
-
-  auto_vec<insn_change *> changes;
-  auto_vec<int, 2> tombstone_uids (2);
-
-  rtx pats[2] = {
-    PATTERN (first->rtl ()),
-    PATTERN (second->rtl ())
-  };
-
-  // Make copies of the patterns as we might need to refer to the original RTL
-  // later, for example when updating debug uses (which is after we've updated
-  // one or both of the patterns in the candidate insns).
-  rtx orig_rtl[2];
+  // We discovered this pair through a common base.  Need to ensure that
+  // we have a common base register that is live at both locations.
+  def_info *base_defs[2] = {};
+  int writeback = 0;
   for (int i = 0; i < 2; i++)
-    orig_rtl[i] = copy_rtx (pats[i]);
-
-  use_array input_uses[2] = { first->uses (), second->uses () };
-  def_array input_defs[2] = { first->defs (), second->defs () };
-
-  int changed_insn = -1;
-  if (base.from_insn != -1)
     {
-      // If we're not already using a shared base, we need
-      // to re-write one of the accesses to use the base from
-      // the other insn.
-      gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1);
-      changed_insn = !base.from_insn;
-
-      rtx base_pat = pats[base.from_insn];
-      rtx change_pat = pats[changed_insn];
-      rtx base_mem = XEXP (base_pat, load_p);
-      rtx change_mem = XEXP (change_pat, load_p);
+      const bool is_lower = (i == reversed);
+      poly_int64 poly_off;
+      rtx base = pair_mem_strip_offset (cand_mems[i], &poly_off);
+      if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
+	writeback |= (1 << i);
 
-      const bool lower_base_p = (insns[base.from_insn] == i1);
-      HOST_WIDE_INT adjust_amt = access_size;
-      if (!lower_base_p)
-	adjust_amt *= -1;
+      if (!REG_P (base) || !poly_off.is_constant ())
+	continue;
 
-      rtx change_reg = XEXP (change_pat, !load_p);
-      machine_mode mode_for_mem = GET_MODE (change_mem);
-      rtx effective_base = drop_writeback (base_mem);
-      rtx new_mem = adjust_address_nv (effective_base,
-				       mode_for_mem,
-				       adjust_amt);
-      rtx new_set = load_p
-	? gen_rtx_SET (change_reg, new_mem)
-	: gen_rtx_SET (new_mem, change_reg);
+      // Punt on accesses relative to eliminable regs.  See the comment in
+      // pair_fusion::track_access for a detailed explanation of this.
+      if (!reload_completed
+	  && (REGNO (base) == FRAME_POINTER_REGNUM
+	      || REGNO (base) == ARG_POINTER_REGNUM))
+	continue;
 
-      pats[changed_insn] = new_set;
+      HOST_WIDE_INT base_off = poly_off.to_constant ();
 
-      auto keep_use = [&](use_info *u)
+      // It should be unlikely that we ever punt here, since MEM_EXPR offset
+      // alignment should be a good proxy for register offset alignment.
+      if (base_off % access_size != 0)
 	{
-	  return refers_to_regno_p (u->regno (), u->regno () + 1,
-				    change_pat, &XEXP (change_pat, load_p));
-	};
-
-      // Drop any uses that only occur in the old address.
-      input_uses[changed_insn] = filter_accesses (attempt,
-						  input_uses[changed_insn],
-						  keep_use);
-    }
-
-  rtx writeback_effect = NULL_RTX;
-  if (writeback)
-    writeback_effect = extract_writebacks (load_p, pats, changed_insn);
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "base not viable, offset misaligned (insn %d)\n",
+		     insns[i]->uid ());
+	  continue;
+	}
 
-  const auto base_regno = base.def->regno ();
+      base_off /= access_size;
 
-  if (base.from_insn == -1 && (writeback & 1))
-    {
-      // If the first of the candidate insns had a writeback form, we'll need to
-      // drop the use of the updated base register from the second insn's uses.
-      //
-      // N.B. we needn't worry about the base register occurring as a store
-      // operand, as we checked that there was no non-address true dependence
-      // between the insns in try_fuse_pair.
-      gcc_checking_assert (find_access (input_uses[1], base_regno));
-      input_uses[1] = check_remove_regno_access (attempt,
-						 input_uses[1],
-						 base_regno);
-    }
+      if (!is_lower)
+	base_off--;
 
-  // Go through and drop uses that only occur in register notes,
-  // as we won't be preserving those.
-  for (int i = 0; i < 2; i++)
-    {
-      auto rti = insns[i]->rtl ();
-      if (!REG_NOTES (rti))
+      if (base_off < PAIR_MEM_MIN_IMM || base_off > PAIR_MEM_MAX_IMM)
 	continue;
 
-      input_uses[i] = remove_note_accesses (attempt, input_uses[i]);
+      use_info *use = find_access (insns[i]->uses (), REGNO (base));
+      gcc_assert (use);
+      base_defs[i] = use->def ();
     }
 
-  // Edge case: if the first insn is a writeback load and the
-  // second insn is a non-writeback load which transfers into the base
-  // register, then we should drop the writeback altogether as the
-  // update of the base register from the second load should prevail.
-  //
-  // For example:
-  //   ldr x2, [x1], #8
-  //   ldr x1, [x1]
-  //   -->
-  //   ldp x2, x1, [x1]
-  if (writeback == 1
-      && load_p
-      && find_access (input_defs[1], base_regno))
+  if (!base_defs[0] && !base_defs[1])
     {
       if (dump_file)
-	fprintf (dump_file,
-		 "  ldp: i%d has wb but subsequent i%d has non-wb "
-		 "update of base (r%d), dropping wb\n",
-		 insns[0]->uid (), insns[1]->uid (), base_regno);
-      gcc_assert (writeback_effect);
-      writeback_effect = NULL_RTX;
+	fprintf (dump_file, "no viable base register for pair (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return writeback;
     }
 
-  // So far the patterns have been in instruction order,
-  // now we want them in offset order.
-  if (i1 != first)
-    std::swap (pats[0], pats[1]);
-
-  poly_int64 offsets[2];
   for (int i = 0; i < 2; i++)
-    {
-      rtx mem = XEXP (pats[i], load_p);
-      gcc_checking_assert (MEM_P (mem));
-      rtx base = strip_offset (XEXP (mem, 0), offsets + i);
-      gcc_checking_assert (REG_P (base));
-      gcc_checking_assert (base_regno == REGNO (base));
+    if ((writeback & (1 << i)) && !base_defs[i])
+      {
+	if (dump_file)
+	  fprintf (dump_file, "insn %d has writeback but base isn't viable\n",
+		   insns[i]->uid ());
+	return writeback;
+      }
+
+  if (writeback == 3
+      && base_defs[0]->regno () != base_defs[1]->regno ())
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "pair (%d,%d): double writeback with distinct regs (%d,%d): "
+		 "punting\n",
+		 insns[0]->uid (), insns[1]->uid (),
+		 base_defs[0]->regno (), base_defs[1]->regno ());
+      return writeback;
     }
 
-  // If either of the original insns had writeback, but the resulting pair insn
-  // does not (can happen e.g. in the ldp edge case above, or if the writeback
-  // effects cancel out), then drop the def(s) of the base register as
-  // appropriate.
+  if (base_defs[0] && base_defs[1]
+      && base_defs[0]->regno () == base_defs[1]->regno ())
+    {
+      // Easy case: insns already share the same base reg.
+      base_cands.quick_push (base_defs[0]);
+      return writeback;
+    }
+
+  // Otherwise, we know that one of the bases must change.
   //
-  // Also drop the first def in the case that both of the original insns had
-  // writeback.  The second def could well have uses, but the first def should
-  // only be used by the second insn (and we dropped that use above).
+  // Note that if there is writeback we must use the writeback base
+  // (we know now there is exactly one).
   for (int i = 0; i < 2; i++)
-    if ((!writeback_effect && (writeback & (1 << i)))
-	|| (i == 0 && writeback == 3))
-      input_defs[i] = check_remove_regno_access (attempt,
-						 input_defs[i],
-						 base_regno);
+    if (base_defs[i] && (!writeback || (writeback & (1 << i))))
+      base_cands.quick_push (base_cand { base_defs[i], i });
+
+  return writeback;
+}
+
+void
+dump_insn_list (FILE *f, const insn_list_t &l)
+{
+  fprintf (f, "(");
+
+  auto i = l.begin ();
+  auto end = l.end ();
+
+  if (i != end)
+    fprintf (f, "%d", (*i)->uid ());
+  i++;
+
+  for (; i != end; i++)
+    fprintf (f, ", %d", (*i)->uid ());
+
+  fprintf (f, ")");
+}
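+
+// Allocate a splay tree node for ACCESS from our obstack (M_OBSTACK).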
+splay_tree_node<access_record *> *
+pair_fusion::node_alloc (access_record *access)
+{
+  using T = splay_tree_node<access_record *>;
+  void *addr = obstack_alloc (&m_obstack, sizeof (T));
+  return new (addr) T (access);
+}
+
+// Given a candidate access INSN (with mem MEM), see if it has a suitable
+// MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
+// LFS is used as part of the key to the hash table, see track_access.
+bool
+pair_fusion::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
+{
+  if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
+    return false;
+
+  poly_int64 offset;
+  tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
+						  &offset);
+  if (!base_expr || !DECL_P (base_expr))
+    return false;
+
+  offset += MEM_OFFSET (mem);
+
+  const machine_mode mem_mode = GET_MODE (mem);
+  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
+
+  // Punt on misaligned offsets.  Pair instructions require offsets to be a
+  // multiple of the access size, and we believe that misaligned offsets on
+  // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
+  if (!multiple_p (offset, mem_size))
+    return false;
+
+  const auto key = std::make_pair (base_expr, encode_lfs (lfs));
+  access_group &group = expr_map.get_or_insert (key, NULL);
+  auto alloc = [&](access_record *access) { return node_alloc (access); };
+  group.track (alloc, offset, insn);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "[bb %u] tracking insn %d via ",
+	       m_bb->index (), insn->uid ());
+      print_node_brief (dump_file, "mem expr", base_expr, 0);
+      fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
+	       lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
+      print_dec (offset, dump_file);
+      fprintf (dump_file, "]\n");
+    }
+
+  return true;
+}
+
+// Main function to begin pair discovery.  Given a memory access INSN,
+// determine whether it could be a candidate for fusing into a load/store pair,
+// and if so, track it in the appropriate data structure for this basic
+// block.  LOAD_P is true if the access is a load, and MEM is the mem
+// rtx that occurs in INSN.
+void
+pair_fusion::track_access (insn_info *insn, bool load_p, rtx mem)
+{
+  // We can't combine volatile MEMs, so punt on these.
+  if (MEM_VOLATILE_P (mem))
+    return;
+
+  // Ignore writeback accesses if the param says to do so
+  if (pair_is_writeback ()
+      && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
+    return;
+
+  const machine_mode mem_mode = GET_MODE (mem);
+
+  if (!pair_operand_mode_ok_p (mem_mode))
+    return;
+
+  rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);
+
+  if (pair_check_register_operand (load_p, reg_op, mem_mode))
+    return;
+
+  // We want to segregate FP/SIMD accesses from GPR accesses.
+  //
+  // Before RA, we use the modes, noting that stores of constant zero
+  // operands use GPRs (even in non-integer modes).  After RA, we use
+  // the hard register numbers.
+  const bool fpsimd_op_p = is_fpsimd_op_p (reg_op, mem_mode, load_p);
+  // Note pair_operand_mode_ok_p already rejected VL modes.
+  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
+  const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };
+
+  if (track_via_mem_expr (insn, mem, lfs))
+    return;
+
+  poly_int64 mem_off;
+  rtx addr = XEXP (mem, 0);
+  const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
+  rtx base = pair_mem_strip_offset (mem, &mem_off);
+  if (!REG_P (base))
+    return;
+
+  // Need to calculate two (possibly different) offsets:
+  //  - Offset at which the access occurs.
+  //  - Offset of the new base def.
+  poly_int64 access_off;
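+  // A post-modify access reads or writes at the unmodified base address,
+  // so its access offset is zero.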
+  if (autoinc_p && any_post_modify_p (addr))
+    access_off = 0;
+  else
+    access_off = mem_off;
+
+  poly_int64 new_def_off = mem_off;
+
+  // Punt on accesses relative to eliminable regs.  Since we don't know the
+  // elimination offset pre-RA, we should postpone forming pairs on such
+  // accesses until after RA.
+  //
+  // As it stands, addresses with offsets in range for LDR but not
+  // in range for a paired access are currently reloaded inefficiently,
+  // ending up with a separate base register for each pair.
+  //
+  // In theory LRA should make use of
+  // targetm.legitimize_address_displacement to promote sharing of
+  // bases among multiple (nearby) address reloads, but the current
+  // LRA code returns early from process_address_1 for operands that
+  // satisfy "m", even if they don't satisfy the real (relaxed) address
+  // constraint; this early return means we never get to the code
+  // that calls targetm.legitimize_address_displacement.
+  //
+  // So for now, it's better to punt when we can't be sure that the
+  // offset is in range for a paired access.  Out-of-range cases can then be
+  // handled after RA by the out-of-range pair peepholes.  Eventually, it
+  // would be nice to handle known out-of-range opportunities in the
+  // pass itself (for stack accesses, this would be in the post-RA pass).
+  if (!reload_completed
+      && (REGNO (base) == FRAME_POINTER_REGNUM
+	  || REGNO (base) == ARG_POINTER_REGNUM))
+    return;
+
+  // Now need to find def of base register.
+  use_info *base_use = find_access (insn->uses (), REGNO (base));
+  gcc_assert (base_use);
+  def_info *base_def = base_use->def ();
+  if (!base_def)
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "base register (regno %d) of insn %d is undefined",
+		 REGNO (base), insn->uid ());
+      return;
+    }
+
+  alt_base *canon_base = canon_base_map.get (base_def);
+  if (canon_base)
+    {
+      // Express this as the combined offset from the canonical base.
+      base_def = canon_base->base;
+      new_def_off += canon_base->offset;
+      access_off += canon_base->offset;
+    }
+
+  if (autoinc_p)
+    {
+      auto def = find_access (insn->defs (), REGNO (base));
+      gcc_assert (def);
+
+      // Record that DEF = BASE_DEF + MEM_OFF.
+      if (dump_file)
+	{
+	  pretty_printer pp;
+	  pp_access (&pp, def, 0);
+	  pp_string (&pp, " = ");
+	  pp_access (&pp, base_def, 0);
+	  fprintf (dump_file, "[bb %u] recording %s + ",
+		   m_bb->index (), pp_formatted_text (&pp));
+	  print_dec (new_def_off, dump_file);
+	  fprintf (dump_file, "\n");
+	}
+
+      alt_base base_rec { base_def, new_def_off };
+      if (canon_base_map.put (def, base_rec))
+	gcc_unreachable (); // Base defs should be unique.
+    }
+
+  // Punt on misaligned offsets.  Paired accesses require offsets to be a
+  // multiple of the access size.
+  if (!multiple_p (mem_off, mem_size))
+    return;
+
+  const auto key = std::make_pair (base_def, encode_lfs (lfs));
+  access_group &group = def_map.get_or_insert (key, NULL);
+  auto alloc = [&](access_record *access) { return node_alloc (access); };
+  group.track (alloc, access_off, insn);
+
+  if (dump_file)
+    {
+      pretty_printer pp;
+      pp_access (&pp, base_def, 0);
+
+      fprintf (dump_file, "[bb %u] tracking insn %d via %s",
+	       m_bb->index (), insn->uid (), pp_formatted_text (&pp));
+      fprintf (dump_file,
+	       " [L=%d, WB=%d, FP=%d, %smode, off=",
+	       lfs.load_p, autoinc_p, lfs.fpsimd_p, mode_name[mem_mode]);
+      print_dec (access_off, dump_file);
+      fprintf (dump_file, "]\n");
+    }
+}
+
+// We just emitted a tombstone with uid UID, track it in a bitmap for
+// this BB so we can easily identify it later when cleaning up tombstones.
+void
+pair_fusion::track_tombstone (int uid)
+{
+  if (!m_emitted_tombstone)
+    {
+      // Lazily initialize the bitmap for tracking tombstone insns.
+      bitmap_obstack_initialize (&m_bitmap_obstack);
+      bitmap_initialize (&m_tombstone_bitmap, &m_bitmap_obstack);
+      m_emitted_tombstone = true;
+    }
+
+  if (!bitmap_set_bit (&m_tombstone_bitmap, uid))
+    gcc_unreachable (); // Bit should have changed.
+}
+
+// Given two adjacent memory accesses of the same size, I1 and I2, try
+// and see if we can merge them into a load pair or store pair.
+//
+// ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
+// if the accesses are both loads, otherwise they are both stores.
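+//
+// Returns true iff we successfully fuse the two accesses into a single
+// pair insn.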
+bool
+pair_fusion::try_fuse_pair (bool load_p, unsigned access_size,
+			    insn_info *i1, insn_info *i2)
+{
+  if (dump_file)
+    fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
+	     load_p, i1->uid (), i2->uid ());
+
+  insn_info *insns[2];
+  bool reversed = false;
+  if (*i1 < *i2)
+    {
+      insns[0] = i1;
+      insns[1] = i2;
+    }
+  else
+    {
+      insns[0] = i2;
+      insns[1] = i1;
+      reversed = true;
+    }
+
+  rtx cand_mems[2];
+  rtx reg_ops[2];
+  rtx pats[2];
+  for (int i = 0; i < 2; i++)
+    {
+      pats[i] = PATTERN (insns[i]->rtl ());
+      cand_mems[i] = XEXP (pats[i], load_p);
+      reg_ops[i] = XEXP (pats[i], !load_p);
+    }
+
+  if (!load_p && !fuseable_store_p (i1, i2))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "punting on store pair due to non-fuseable candidates (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
+  if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "punting on load pair due to reg conflicts (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
+  if (cfun->can_throw_non_call_exceptions
+      && find_reg_note (insns[0]->rtl (), REG_EH_REGION, NULL_RTX)
+      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "can't combine insns with EH side effects (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
+  auto_vec<base_cand, 2> base_cands (2);
+
+  int writeback = get_viable_bases (insns, base_cands, cand_mems,
+				    access_size, reversed);
+  if (base_cands.is_empty ())
+    {
+      if (dump_file)
+	fprintf (dump_file, "no viable base for pair (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
+  // Punt on frame-related insns with writeback.  We probably won't see
+  // these in practice, but this is conservative and ensures we don't
+  // have to worry about these later on.
+  if (writeback && (RTX_FRAME_RELATED_P (i1->rtl ())
+		    || RTX_FRAME_RELATED_P (i2->rtl ())))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "rejecting pair (%d,%d): frame-related insn with writeback\n",
+		 i1->uid (), i2->uid ());
+      return false;
+    }
+
+  rtx *ignore = &XEXP (pats[1], load_p);
+  for (auto use : insns[1]->uses ())
+    if (!use->is_mem ()
+	&& refers_to_regno_p (use->regno (), use->regno () + 1, pats[1], ignore)
+	&& use->def () && use->def ()->insn () == insns[0])
+      {
+	// N.B. we allow a true dependence on the base address, as this
+	// happens in the case of auto-inc accesses.  Consider a post-increment
+	// load followed by a regular indexed load, for example.
+	if (dump_file)
+	  fprintf (dump_file,
+		   "%d has non-address true dependence on %d, rejecting pair\n",
+		   insns[1]->uid (), insns[0]->uid ());
+	return false;
+      }
 
-  // If we don't currently have a writeback pair, and we don't have
-  // a load that clobbers the base register, look for a trailing destructive
-  // update of the base register and try and fold it in to make this into a
-  // writeback pair.
-  insn_info *trailing_add = nullptr;
-  if (aarch64_ldp_writeback > 1
-      && !writeback_effect
-      && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
-					 XEXP (pats[0], 0), nullptr)
-		      && !refers_to_regno_p (base_regno, base_regno + 1,
-					     XEXP (pats[1], 0), nullptr))))
+  unsigned i = 0;
+  while (i < base_cands.length ())
     {
-      def_info *add_def;
-      trailing_add = find_trailing_add (insns, move_range, writeback,
-					&writeback_effect,
-					&add_def, base.def, offsets[0],
-					access_size);
-      if (trailing_add)
+      base_cand &cand = base_cands[i];
+
+      rtx *ignore[2] = {};
+      for (int j = 0; j < 2; j++)
+	if (cand.from_insn == !j)
+	  ignore[j] = &XEXP (cand_mems[j], 0);
+
+      insn_info *h = first_hazard_after (insns[0], ignore[0]);
+      if (h && *h < *insns[1])
+	cand.hazards[0] = h;
+
+      h = latest_hazard_before (insns[1], ignore[1]);
+      if (h && *h > *insns[0])
+	cand.hazards[1] = h;
+
+      if (!cand.viable ())
 	{
-	  // The def of the base register from the trailing add should prevail.
-	  input_defs[0] = insert_access (attempt, add_def, input_defs[0]);
-	  gcc_assert (input_defs[0].is_valid ());
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "pair (%d,%d): rejecting base %d due to dataflow "
+		     "hazards (%d,%d)\n",
+		     insns[0]->uid (),
+		     insns[1]->uid (),
+		     cand.def->regno (),
+		     cand.hazards[0]->uid (),
+		     cand.hazards[1]->uid ());
+
+	  base_cands.ordered_remove (i);
 	}
+      else
+	i++;
     }
 
-  // Now that we know what base mem we're going to use, check if it's OK
-  // with the ldp/stp policy.
-  rtx first_mem = XEXP (pats[0], load_p);
-  if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
-						load_p,
-						GET_MODE (first_mem)))
+  if (base_cands.is_empty ())
     {
       if (dump_file)
-	fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n",
-		 i1->uid (), i2->uid ());
+	fprintf (dump_file,
+		 "can't form pair (%d,%d) due to dataflow hazards\n",
+		 insns[0]->uid (), insns[1]->uid ());
       return false;
     }
 
-  rtx reg_notes = combine_reg_notes (first, second, load_p);
+  insn_info *alias_hazards[4] = {};
 
-  rtx pair_pat;
-  if (writeback_effect)
+  // First def of memory after the first insn, and last def of memory
+  // before the second insn, respectively.
+  def_info *mem_defs[2] = {};
+  if (load_p)
     {
-      auto patvec = gen_rtvec (3, writeback_effect, pats[0], pats[1]);
-      pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
+      if (!MEM_READONLY_P (cand_mems[0]))
+	{
+	  mem_defs[0] = memory_access (insns[0]->uses ())->def ();
+	  gcc_checking_assert (mem_defs[0]);
+	  mem_defs[0] = mem_defs[0]->next_def ();
+	}
+      if (!MEM_READONLY_P (cand_mems[1]))
+	{
+	  mem_defs[1] = memory_access (insns[1]->uses ())->def ();
+	  gcc_checking_assert (mem_defs[1]);
+	}
     }
-  else if (load_p)
-    pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
-				      XEXP (pats[1], 0),
-				      XEXP (pats[0], 1));
   else
-    pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
-				       XEXP (pats[0], 1),
-				       XEXP (pats[1], 1));
+    {
+      mem_defs[0] = memory_access (insns[0]->defs ())->next_def ();
+      mem_defs[1] = memory_access (insns[1]->defs ())->prev_def ();
+      gcc_checking_assert (mem_defs[0]);
+      gcc_checking_assert (mem_defs[1]);
+    }
 
-  insn_change *pair_change = nullptr;
-  auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
-    rtx_insn *rti = change->insn ()->rtl ();
-    validate_unshare_change (rti, &PATTERN (rti), pair_pat, true);
-    validate_change (rti, &REG_NOTES (rti), reg_notes, true);
+  auto tombstone_p = [&](insn_info *insn) -> bool {
+    return m_emitted_tombstone
+	   && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ());
   };
 
-  if (load_p)
-    {
-      changes.safe_push (make_delete (first));
-      pair_change = make_change (second);
-      changes.safe_push (pair_change);
+  store_walker<false, decltype(tombstone_p)>
+    forward_store_walker (mem_defs[0], cand_mems[0], insns[1], tombstone_p);
 
-      pair_change->move_range = move_range;
-      pair_change->new_defs = merge_access_arrays (attempt,
-						   input_defs[0],
-						   input_defs[1]);
-      gcc_assert (pair_change->new_defs.is_valid ());
+  store_walker<true, decltype(tombstone_p)>
+    backward_store_walker (mem_defs[1], cand_mems[1], insns[0], tombstone_p);
 
-      pair_change->new_uses
-	= merge_access_arrays (attempt,
-			       drop_memory_access (input_uses[0]),
-			       drop_memory_access (input_uses[1]));
-      gcc_assert (pair_change->new_uses.is_valid ());
-      set_pair_pat (pair_change);
-    }
+  alias_walker *walkers[4] = {};
+  if (mem_defs[0])
+    walkers[0] = &forward_store_walker;
+  if (mem_defs[1])
+    walkers[1] = &backward_store_walker;
+
+  if (load_p && (mem_defs[0] || mem_defs[1]))
+    do_alias_analysis (alias_hazards, walkers, load_p);
   else
     {
-      using Action = stp_change_builder::action;
-      insn_info *store_to_change = try_repurpose_store (first, second,
-							move_range);
-      stp_change_builder builder (insns, store_to_change, pair_dst);
-      insn_change *change;
-      set_info *new_set = nullptr;
-      for (; !builder.done (); builder.advance ())
-	{
-	  auto action = builder.get_change ();
-	  change = (action.type == Action::INSERT)
-	    ? nullptr : make_change (action.insn);
-	  switch (action.type)
-	    {
-	    case Action::CHANGE:
-	    {
-	      set_pair_pat (change);
-	      change->new_uses = merge_access_arrays (attempt,
-						      input_uses[0],
-						      input_uses[1]);
-	      auto d1 = drop_memory_access (input_defs[0]);
-	      auto d2 = drop_memory_access (input_defs[1]);
-	      change->new_defs = merge_access_arrays (attempt, d1, d2);
-	      gcc_assert (change->new_defs.is_valid ());
-	      def_info *stp_def = memory_access (change->insn ()->defs ());
-	      change->new_defs = insert_access (attempt,
-						stp_def,
-						change->new_defs);
-	      gcc_assert (change->new_defs.is_valid ());
-	      change->move_range = move_range;
-	      pair_change = change;
-	      break;
-	    }
-	    case Action::TOMBSTONE:
-	    {
-	      tombstone_uids.quick_push (change->insn ()->uid ());
-	      rtx_insn *rti = change->insn ()->rtl ();
-	      validate_change (rti, &PATTERN (rti), gen_tombstone (), true);
-	      validate_change (rti, &REG_NOTES (rti), NULL_RTX, true);
-	      change->new_uses = use_array (nullptr, 0);
-	      break;
-	    }
-	    case Action::INSERT:
-	    {
-	      if (dump_file)
-		fprintf (dump_file,
-			 "  stp: cannot re-purpose candidate stores\n");
-
-	      auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat);
-	      change = make_change (new_insn);
-	      change->move_range = move_range;
-	      change->new_uses = merge_access_arrays (attempt,
-						      input_uses[0],
-						      input_uses[1]);
-	      gcc_assert (change->new_uses.is_valid ());
+      // We want to find any loads hanging off the first store.
+      mem_defs[0] = memory_access (insns[0]->defs ());
+      load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]);
+      load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]);
+      walkers[2] = &forward_load_walker;
+      walkers[3] = &backward_load_walker;
+      do_alias_analysis (alias_hazards, walkers, load_p);
+      // Now consolidate hazards back down.
+      if (alias_hazards[2]
+	  && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0])))
+	alias_hazards[0] = alias_hazards[2];
 
-	      auto d1 = drop_memory_access (input_defs[0]);
-	      auto d2 = drop_memory_access (input_defs[1]);
-	      change->new_defs = merge_access_arrays (attempt, d1, d2);
-	      gcc_assert (change->new_defs.is_valid ());
+      if (alias_hazards[3]
+	  && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1])))
+	alias_hazards[1] = alias_hazards[3];
+    }
 
-	      new_set = crtl->ssa->create_set (attempt, new_insn, memory);
-	      change->new_defs = insert_access (attempt, new_set,
-						change->new_defs);
-	      gcc_assert (change->new_defs.is_valid ());
-	      pair_change = change;
-	      break;
-	    }
-	    case Action::FIXUP_USE:
-	    {
-	      // This use now needs to consume memory from our stp.
-	      if (dump_file)
-		fprintf (dump_file,
-			 "  stp: changing i%d to use mem from new stp "
-			 "(after i%d)\n",
-			 action.insn->uid (), pair_dst->uid ());
-	      change->new_uses = drop_memory_access (change->new_uses);
-	      gcc_assert (new_set);
-	      auto new_use = crtl->ssa->create_use (attempt, action.insn,
-						    new_set);
-	      change->new_uses = insert_access (attempt, new_use,
-						change->new_uses);
-	      break;
-	    }
-	    }
-	  changes.safe_push (change);
-	}
+  if (alias_hazards[0] && alias_hazards[1]
+      && *alias_hazards[0] <= *alias_hazards[1])
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n",
+		 i1->uid (), i2->uid (),
+		 alias_hazards[0]->uid (), alias_hazards[1]->uid ());
+      return false;
     }
 
-  if (trailing_add)
-    changes.safe_push (make_delete (trailing_add));
-  else if ((writeback & 2) && !writeback_effect)
+  // Now narrow the hazards on each base candidate using
+  // the alias hazards.
+  i = 0;
+  while (i < base_cands.length ())
     {
-      // The second insn initially had writeback but now the pair does not,
-      // need to update any nondebug uses of the base register def in the
-      // second insn.  We'll take care of debug uses later.
-      auto def = find_access (insns[1]->defs (), base_regno);
-      gcc_assert (def);
-      auto set = dyn_cast<set_info *> (def);
-      if (set && set->has_nondebug_uses ())
-	{
-	  auto orig_use = find_access (insns[0]->uses (), base_regno);
-	  for (auto use : set->nondebug_insn_uses ())
-	    {
-	      auto change = make_change (use->insn ());
-	      change->new_uses = check_remove_regno_access (attempt,
-							    change->new_uses,
-							    base_regno);
-	      change->new_uses = insert_access (attempt,
-						orig_use,
-						change->new_uses);
-	      changes.safe_push (change);
-	    }
+      base_cand &cand = base_cands[i];
+      if (alias_hazards[0] && (!cand.hazards[0]
+			       || *alias_hazards[0] < *cand.hazards[0]))
+	cand.hazards[0] = alias_hazards[0];
+      if (alias_hazards[1] && (!cand.hazards[1]
+			       || *alias_hazards[1] > *cand.hazards[1]))
+	cand.hazards[1] = alias_hazards[1];
+
+      if (cand.viable ())
+	i++;
+      else
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "pair (%d,%d): rejecting base %d due to "
+				"alias/dataflow hazards (%d,%d)",
+				insns[0]->uid (), insns[1]->uid (),
+				cand.def->regno (),
+				cand.hazards[0]->uid (),
+				cand.hazards[1]->uid ());
+
+	  base_cands.ordered_remove (i);
 	}
     }
 
-  auto is_changing = insn_is_changing (changes);
-  for (unsigned i = 0; i < changes.length (); i++)
-    gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing));
-
-  // Check the pair pattern is recog'd.
-  if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing))
+  if (base_cands.is_empty ())
     {
       if (dump_file)
-	fprintf (dump_file, "  failed to form pair, recog failed\n");
+	fprintf (dump_file,
+		 "cannot form pair (%d,%d) due to alias/dataflow hazards",
+		 insns[0]->uid (), insns[1]->uid ());
 
-      // Free any reg notes we allocated.
-      while (reg_notes)
-	{
-	  rtx next = XEXP (reg_notes, 1);
-	  free_EXPR_LIST_node (reg_notes);
-	  reg_notes = next;
-	}
-      cancel_changes (0);
       return false;
     }
 
-  gcc_assert (crtl->ssa->verify_insn_changes (changes));
-
-  // Fix up any debug uses that will be affected by the changes.
-  if (MAY_HAVE_DEBUG_INSNS)
-    fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add,
-		      load_p, writeback, writeback_effect, base_regno);
-
-  confirm_change_group ();
-  crtl->ssa->change_insns (changes);
-
-  gcc_checking_assert (tombstone_uids.length () <= 2);
-  for (auto uid : tombstone_uids)
-    track_tombstone (uid);
-
-  return true;
-}
-
-// Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
-// within our BUDGET for alias analysis.
-static bool
-store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
-{
-  if (!budget)
+  base_cand *base = &base_cands[0];
+  if (base_cands.length () > 1)
     {
-      if (dump_file)
+      // If there are still multiple viable bases, it makes sense
+      // to choose one that allows us to reduce register pressure,
+      // for loads this means moving further down, for stores this
+      // means moving further up.
+      gcc_checking_assert (base_cands.length () == 2);
+      const int hazard_i = !load_p;
+      if (base->hazards[hazard_i])
 	{
-	  fprintf (dump_file,
-		   "exceeded budget, assuming store %d aliases with mem ",
-		   store_insn->uid ());
-	  print_simple_rtl (dump_file, mem);
-	  fprintf (dump_file, "\n");
+	  if (!base_cands[1].hazards[hazard_i])
+	    base = &base_cands[1];
+	  else if (load_p
+		   && *base_cands[1].hazards[hazard_i]
+		      > *(base->hazards[hazard_i]))
+	    base = &base_cands[1];
+	  else if (!load_p
+		   && *base_cands[1].hazards[hazard_i]
+		      < *(base->hazards[hazard_i]))
+	    base = &base_cands[1];
 	}
-
-      return true;
     }
 
-  budget--;
-  return memory_modified_in_insn_p (mem, store_insn->rtl ());
-}
-
-// Return true if LOAD may be modified by STORE.  Make sure we keep
-// within our BUDGET for alias analysis.
-static bool
-load_modified_by_store_p (insn_info *load,
-			  insn_info *store,
-			  int &budget)
-{
-  gcc_checking_assert (budget >= 0);
+  // Otherwise, hazards[0] > hazards[1].
+  // Pair can be formed anywhere in (hazards[1], hazards[0]).
+  insn_range_info range (insns[0], insns[1]);
+  if (base->hazards[1])
+    range.first = base->hazards[1];
+  if (base->hazards[0])
+    range.last = base->hazards[0]->prev_nondebug_insn ();
 
-  if (!budget)
+  // If the second insn can throw, narrow the move range to exactly that insn.
+  // This prevents us trying to move the second insn from the end of the BB.
+  if (cfun->can_throw_non_call_exceptions
+      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
     {
-      if (dump_file)
-	{
-	  fprintf (dump_file,
-		   "exceeded budget, assuming load %d aliases with store %d\n",
-		   load->uid (), store->uid ());
-	}
-      return true;
+      gcc_assert (range.includes (insns[1]));
+      range = insn_range_info (insns[1]);
     }
 
-  // It isn't safe to re-order stores over calls.
-  if (CALL_P (load->rtl ()))
-    return true;
+  // Placement strategy: push loads down and pull stores up, this should
+  // help register pressure by reducing live ranges.
+  if (load_p)
+    range.first = range.last;
+  else
+    range.last = range.first;
 
-  budget--;
+  if (dump_file)
+    {
+      auto print_hazard = [](insn_info *i)
+	{
+	  if (i)
+	    fprintf (dump_file, "%d", i->uid ());
+	  else
+	    fprintf (dump_file, "-");
+	};
+      auto print_pair = [print_hazard](insn_info **i)
+	{
+	  print_hazard (i[0]);
+	  fprintf (dump_file, ",");
+	  print_hazard (i[1]);
+	};
 
-  // Iterate over all MEMs in the load, seeing if any alias with
-  // our store.
-  subrtx_var_iterator::array_type array;
-  rtx pat = PATTERN (load->rtl ());
-  FOR_EACH_SUBRTX_VAR (iter, array, pat, NONCONST)
-    if (MEM_P (*iter) && memory_modified_in_insn_p (*iter, store->rtl ()))
-      return true;
+      fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
+	      load_p, insns[0]->uid (), insns[1]->uid (),
+	      base->def->regno ());
+      print_pair (base->hazards);
+      fprintf (dump_file, "), move_range: (%d,%d)\n",
+	       range.first->uid (), range.last->uid ());
+    }
 
-  return false;
+  return fuse_pair (load_p, access_size, writeback,
+		    i1, i2, *base, range);
 }
 
-// Virtual base class for load/store walkers used in alias analysis.
-struct alias_walker
-{
-  virtual bool conflict_p (int &budget) const = 0;
-  virtual insn_info *insn () const = 0;
-  virtual bool valid () const  = 0;
-  virtual void advance () = 0;
-};
-
-// Implement some common functionality used by both store_walker
-// and load_walker.
-template<bool reverse>
-class def_walker : public alias_walker
-{
-protected:
-  using def_iter_t = typename std::conditional<reverse,
-	reverse_def_iterator, def_iterator>::type;
-
-  static use_info *start_use_chain (def_iter_t &def_iter)
-  {
-    set_info *set = nullptr;
-    for (; *def_iter; def_iter++)
-      {
-	set = dyn_cast<set_info *> (*def_iter);
-	if (!set)
-	  continue;
-
-	use_info *use = reverse
-	  ? set->last_nondebug_insn_use ()
-	  : set->first_nondebug_insn_use ();
-
-	if (use)
-	  return use;
-      }
-
-    return nullptr;
-  }
-
-  def_iter_t def_iter;
-  insn_info *limit;
-  def_walker (def_info *def, insn_info *limit) :
-    def_iter (def), limit (limit) {}
-
-  virtual bool iter_valid () const { return *def_iter; }
-
-public:
-  insn_info *insn () const override { return (*def_iter)->insn (); }
-  void advance () override { def_iter++; }
-  bool valid () const override final
-  {
-    if (!iter_valid ())
-      return false;
-
-    if (reverse)
-      return *(insn ()) > *limit;
-    else
-      return *(insn ()) < *limit;
-  }
-};
 
-// alias_walker that iterates over stores.
-template<bool reverse, typename InsnPredicate>
-class store_walker : public def_walker<reverse>
+// LEFT_LIST and RIGHT_LIST are lists of candidate instructions where all insns
+// in LEFT_LIST are known to be adjacent to those in RIGHT_LIST.
+//
+// This function traverses the resulting 2D matrix of possible pair candidates
+// and attempts to merge them into pairs.
+//
+// The algorithm is straightforward: if we consider a combined list of
+// candidates X obtained by merging LEFT_LIST and RIGHT_LIST in program order,
+// then we advance through X until we reach a crossing point (where X[i] and
+// X[i+1] come from different source lists).
+//
+// At this point we know X[i] and X[i+1] are adjacent accesses, and we try to
+// fuse them into a pair.  If this succeeds, we remove X[i] and X[i+1] from
+// their original lists and continue as above.
+//
+// In the failure case, we advance through the source list containing X[i] and
+// continue as above (proceeding to the next crossing point).
+//
+// The rationale for skipping over groups of consecutive candidates from the
+// same source list is as follows:
+//
+// In the store case, the insns in the group can't be re-ordered over each
+// other as they are guaranteed to store to the same location, so we're
+// guaranteed not to lose opportunities by doing this.
+//
+// In the load case, subsequent loads from the same location are either
+// redundant (in which case they should have been cleaned up by an earlier
+// optimization pass) or there is an intervening aliasing hazard, in which case
+// we can't re-order them anyway, so provided earlier passes have cleaned up
+// redundant loads, we shouldn't miss opportunities by doing this.
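+//
+// As a hypothetical illustration (uids given in program order), with
+// LEFT_LIST = (1, 5, 9) and RIGHT_LIST = (3, 7) the crossing points
+// visited are (1,3), then (5,7) if that first fusion succeeded or (5,3)
+// if it did not, and so on.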
+void
+pair_fusion::merge_pairs (insn_list_t &left_list,
+			  insn_list_t &right_list,
+			  bool load_p,
+			  unsigned access_size)
 {
-  rtx cand_mem;
-  InsnPredicate tombstone_p;
-
-public:
-  store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn,
-		InsnPredicate tombstone_fn) :
-    def_walker<reverse> (mem_def, limit_insn),
-    cand_mem (mem), tombstone_p (tombstone_fn) {}
-
-  bool conflict_p (int &budget) const override final
-  {
-    if (tombstone_p (this->insn ()))
-      return false;
+  if (dump_file)
+    {
+      fprintf (dump_file, "merge_pairs [L=%d], cand vecs ", load_p);
+      dump_insn_list (dump_file, left_list);
+      fprintf (dump_file, " x ");
+      dump_insn_list (dump_file, right_list);
+      fprintf (dump_file, "\n");
+    }
 
-    return store_modifies_mem_p (cand_mem, this->insn (), budget);
-  }
-};
+  auto iter_l = left_list.begin ();
+  auto iter_r = right_list.begin ();
 
-// alias_walker that iterates over loads.
-template<bool reverse>
-class load_walker : public def_walker<reverse>
+  while (iter_l != left_list.end () && iter_r != right_list.end ())
+    {
+      auto next_l = std::next (iter_l);
+      auto next_r = std::next (iter_r);
+      if (**iter_l < **iter_r
+	  && next_l != left_list.end ()
+	  && **next_l < **iter_r)
+	iter_l = next_l;
+      else if (**iter_r < **iter_l
+	       && next_r != right_list.end ()
+	       && **next_r < **iter_l)
+	iter_r = next_r;
+      else if (try_fuse_pair (load_p, access_size, *iter_l, *iter_r))
+	{
+	  left_list.erase (iter_l);
+	  iter_l = next_l;
+	  right_list.erase (iter_r);
+	  iter_r = next_r;
+	}
+      else if (**iter_l < **iter_r)
+	iter_l = next_l;
+      else
+	iter_r = next_r;
+    }
+}
+
+// If we emitted tombstone insns for this BB, iterate through the BB
+// and remove all the tombstone insns, being sure to reparent any uses
+// of mem to previous defs when we do this.
+void
+pair_fusion::cleanup_tombstones ()
 {
-  using Base = def_walker<reverse>;
-  using use_iter_t = typename std::conditional<reverse,
-	reverse_use_iterator, nondebug_insn_use_iterator>::type;
+  // No need to do anything if we didn't emit a tombstone insn for this BB.
+  if (!m_emitted_tombstone)
+    return;
 
-  use_iter_t use_iter;
-  insn_info *cand_store;
+  insn_info *insn = m_bb->head_insn ();
+  while (insn)
+    {
+      insn_info *next = insn->next_nondebug_insn ();
+      if (!insn->is_real ()
+	  || !bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()))
+	{
+	  insn = next;
+	  continue;
+	}
 
-  bool iter_valid () const override final { return *use_iter; }
+      auto def = memory_access (insn->defs ());
+      auto set = dyn_cast<set_info *> (def);
+      if (set && set->has_any_uses ())
+	{
+	  def_info *prev_def = def->prev_def ();
+	  auto prev_set = dyn_cast<set_info *> (prev_def);
+	  if (!prev_set)
+	    gcc_unreachable ();
 
-public:
-  void advance () override final
-  {
-    use_iter++;
-    if (*use_iter)
-      return;
-    this->def_iter++;
-    use_iter = Base::start_use_chain (this->def_iter);
-  }
+	  while (set->first_use ())
+	    crtl->ssa->reparent_use (set->first_use (), prev_set);
+	}
 
-  insn_info *insn () const override final
-  {
-    return (*use_iter)->insn ();
-  }
+      // Now set has no uses, we can delete it.
+      insn_change change (insn, insn_change::DELETE);
+      crtl->ssa->change_insn (change);
+      insn = next;
+    }
+}
 
-  bool conflict_p (int &budget) const override final
-  {
-    return load_modified_by_store_p (insn (), cand_store, budget);
-  }
+template<typename Map>
+void
+pair_fusion::traverse_base_map (Map &map)
+{
+  for (auto kv : map)
+    {
+      const auto &key = kv.first;
+      auto &value = kv.second;
+      transform_for_base (key.second, value);
+    }
+}
 
-  load_walker (def_info *def, insn_info *store, insn_info *limit_insn)
-    : Base (def, limit_insn),
-      use_iter (Base::start_use_chain (this->def_iter)),
-      cand_store (store) {}
-};
+void
+pair_fusion::transform ()
+{
+  traverse_base_map (expr_map);
+  traverse_base_map (def_map);
+}
 
 // Process our alias_walkers in a round-robin fashion, proceeding until
 // nothing more can be learned from alias analysis.
 //
 // We try to maintain the invariant that if a walker becomes invalid, we
 // set its pointer to null.
-static void
-do_alias_analysis (insn_info *alias_hazards[4],
+void
+pair_fusion::do_alias_analysis (insn_info *alias_hazards[4],
 		   alias_walker *walkers[4],
 		   bool load_p)
 {
   const int n_walkers = 2 + (2 * !load_p);
-  int budget = aarch64_ldp_alias_check_limit;
+  int budget = pair_mem_alias_check_limit ();
 
   auto next_walker = [walkers,n_walkers](int current) -> int {
     for (int j = 1; j <= n_walkers; j++)
@@ -2341,548 +2553,554 @@ do_alias_analysis (insn_info *alias_hazards[4],
     }
 }
 
-// Given INSNS (in program order) which are known to be adjacent, look
-// to see if either insn has a suitable RTL (register) base that we can
-// use to form a pair.  Push these to BASE_CANDS if we find any.  CAND_MEMs
-// gives the relevant mems from the candidate insns, ACCESS_SIZE gives the
-// size of a single candidate access, and REVERSED says whether the accesses
-// are inverted in offset order.
+// Try and actually fuse the pair given by insns I1 and I2.
 //
-// Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
-// addressing.
-static int
-get_viable_bases (insn_info *insns[2],
-		  vec<base_cand> &base_cands,
-		  rtx cand_mems[2],
-		  unsigned access_size,
-		  bool reversed)
+// Here we've done enough analysis to know this is safe, we only
+// reject the pair at this stage if either the tuning policy says to,
+// or recog fails on the final pair insn.
+//
+// LOAD_P is true for loads, ACCESS_SIZE gives the access size of each
+// candidate insn.  Bit i of WRITEBACK is set if the ith insn (in program
+// order) uses writeback.
+//
+// BASE gives the chosen base candidate for the pair and MOVE_RANGE is
+// a singleton range which says where to place the pair.
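+//
+// Returns true if the pair was successfully formed.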
+bool
+pair_fusion::fuse_pair (bool load_p,
+			unsigned access_size,
+			int writeback,
+			insn_info *i1, insn_info *i2,
+			base_cand &base,
+			const insn_range_info &move_range)
 {
-  // We discovered this pair through a common base.  Need to ensure that
-  // we have a common base register that is live at both locations.
-  def_info *base_defs[2] = {};
-  int writeback = 0;
-  for (int i = 0; i < 2; i++)
-    {
-      const bool is_lower = (i == reversed);
-      poly_int64 poly_off;
-      rtx base = ldp_strip_offset (cand_mems[i], &poly_off);
-      if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
-	writeback |= (1 << i);
-
-      if (!REG_P (base) || !poly_off.is_constant ())
-	continue;
-
-      // Punt on accesses relative to eliminable regs.  See the comment in
-      // ldp_bb_info::track_access for a detailed explanation of this.
-      if (!reload_completed
-	  && (REGNO (base) == FRAME_POINTER_REGNUM
-	      || REGNO (base) == ARG_POINTER_REGNUM))
-	continue;
-
-      HOST_WIDE_INT base_off = poly_off.to_constant ();
-
-      // It should be unlikely that we ever punt here, since MEM_EXPR offset
-      // alignment should be a good proxy for register offset alignment.
-      if (base_off % access_size != 0)
-	{
-	  if (dump_file)
-	    fprintf (dump_file,
-		     "base not viable, offset misaligned (insn %d)\n",
-		     insns[i]->uid ());
-	  continue;
-	}
-
-      base_off /= access_size;
-
-      if (!is_lower)
-	base_off--;
-
-      if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM)
-	continue;
-
-      use_info *use = find_access (insns[i]->uses (), REGNO (base));
-      gcc_assert (use);
-      base_defs[i] = use->def ();
-    }
+  auto attempt = crtl->ssa->new_change_attempt ();
 
-  if (!base_defs[0] && !base_defs[1])
+  auto make_change = [&attempt](insn_info *insn)
     {
-      if (dump_file)
-	fprintf (dump_file, "no viable base register for pair (%d,%d)\n",
-		 insns[0]->uid (), insns[1]->uid ());
-      return writeback;
-    }
-
-  for (int i = 0; i < 2; i++)
-    if ((writeback & (1 << i)) && !base_defs[i])
-      {
-	if (dump_file)
-	  fprintf (dump_file, "insn %d has writeback but base isn't viable\n",
-		   insns[i]->uid ());
-	return writeback;
-      }
-
-  if (writeback == 3
-      && base_defs[0]->regno () != base_defs[1]->regno ())
+      return crtl->ssa->change_alloc<insn_change> (attempt, insn);
+    };
+  auto make_delete = [&attempt](insn_info *insn)
     {
-      if (dump_file)
-	fprintf (dump_file,
-		 "pair (%d,%d): double writeback with distinct regs (%d,%d): "
-		 "punting\n",
-		 insns[0]->uid (), insns[1]->uid (),
-		 base_defs[0]->regno (), base_defs[1]->regno ());
-      return writeback;
-    }
+      return crtl->ssa->change_alloc<insn_change> (attempt,
+						   insn,
+						   insn_change::DELETE);
+    };
 
-  if (base_defs[0] && base_defs[1]
-      && base_defs[0]->regno () == base_defs[1]->regno ())
-    {
-      // Easy case: insns already share the same base reg.
-      base_cands.quick_push (base_defs[0]);
-      return writeback;
-    }
+  if (*i1 > *i2)
+    return false;
 
-  // Otherwise, we know that one of the bases must change.
-  //
-  // Note that if there is writeback we must use the writeback base
-  // (we know now there is exactly one).
-  for (int i = 0; i < 2; i++)
-    if (base_defs[i] && (!writeback || (writeback & (1 << i))))
-      base_cands.quick_push (base_cand { base_defs[i], i });
+  insn_info *first = (*i1 < *i2) ? i1 : i2;
+  insn_info *second = (first == i1) ? i2 : i1;
 
-  return writeback;
-}
+  insn_info *pair_dst = move_range.singleton ();
+  gcc_assert (pair_dst);
+
+  insn_info *insns[2] = { first, second };
 
-// Given two adjacent memory accesses of the same size, I1 and I2, try
-// and see if we can merge them into a ldp or stp.
-//
-// ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
-// if the accesses are both loads, otherwise they are both stores.
-bool
-ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
-			    insn_info *i1, insn_info *i2)
-{
-  if (dump_file)
-    fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
-	     load_p, i1->uid (), i2->uid ());
+  auto_vec<insn_change *> changes;
+  auto_vec<int, 2> tombstone_uids (2);
 
-  insn_info *insns[2];
-  bool reversed = false;
-  if (*i1 < *i2)
-    {
-      insns[0] = i1;
-      insns[1] = i2;
-    }
-  else
-    {
-      insns[0] = i2;
-      insns[1] = i1;
-      reversed = true;
-    }
+  rtx pats[2] = {
+    PATTERN (first->rtl ()),
+    PATTERN (second->rtl ())
+  };
 
-  rtx cand_mems[2];
-  rtx reg_ops[2];
-  rtx pats[2];
+  // Make copies of the patterns as we might need to refer to the original RTL
+  // later, for example when updating debug uses (which is after we've updated
+  // one or both of the patterns in the candidate insns).
+  rtx orig_rtl[2];
   for (int i = 0; i < 2; i++)
-    {
-      pats[i] = PATTERN (insns[i]->rtl ());
-      cand_mems[i] = XEXP (pats[i], load_p);
-      reg_ops[i] = XEXP (pats[i], !load_p);
-    }
+    orig_rtl[i] = copy_rtx (pats[i]);
 
-  if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
-    {
-      if (dump_file)
-	fprintf (dump_file,
-		 "punting on ldp due to reg conflcits (%d,%d)\n",
-		 insns[0]->uid (), insns[1]->uid ());
-      return false;
-    }
+  use_array input_uses[2] = { first->uses (), second->uses () };
+  def_array input_defs[2] = { first->defs (), second->defs () };
 
-  if (cfun->can_throw_non_call_exceptions
-      && find_reg_note (insns[0]->rtl (), REG_EH_REGION, NULL_RTX)
-      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
+  int changed_insn = -1;
+  if (base.from_insn != -1)
     {
-      if (dump_file)
-	fprintf (dump_file,
-		 "can't combine insns with EH side effects (%d,%d)\n",
-		 insns[0]->uid (), insns[1]->uid ());
-      return false;
-    }
+      // If we're not already using a shared base, we need
+      // to re-write one of the accesses to use the base from
+      // the other insn.
+      gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1);
+      changed_insn = !base.from_insn;
 
-  auto_vec<base_cand, 2> base_cands (2);
+      rtx base_pat = pats[base.from_insn];
+      rtx change_pat = pats[changed_insn];
+      rtx base_mem = XEXP (base_pat, load_p);
+      rtx change_mem = XEXP (change_pat, load_p);
 
-  int writeback = get_viable_bases (insns, base_cands, cand_mems,
-				    access_size, reversed);
-  if (base_cands.is_empty ())
-    {
-      if (dump_file)
-	fprintf (dump_file, "no viable base for pair (%d,%d)\n",
-		 insns[0]->uid (), insns[1]->uid ());
-      return false;
-    }
+      const bool lower_base_p = (insns[base.from_insn] == i1);
+      HOST_WIDE_INT adjust_amt = access_size;
+      if (!lower_base_p)
+	adjust_amt *= -1;
 
-  // Punt on frame-related insns with writeback.  We probably won't see
-  // these in practice, but this is conservative and ensures we don't
-  // have to worry about these later on.
-  if (writeback && (RTX_FRAME_RELATED_P (i1->rtl ())
-		    || RTX_FRAME_RELATED_P (i2->rtl ())))
-    {
-      if (dump_file)
-	fprintf (dump_file,
-		 "rejecting pair (%d,%d): frame-related insn with writeback\n",
-		 i1->uid (), i2->uid ());
-      return false;
-    }
+      rtx change_reg = XEXP (change_pat, !load_p);
+      machine_mode mode_for_mem = GET_MODE (change_mem);
+      rtx effective_base = drop_writeback (base_mem);
+      rtx new_mem = adjust_address_nv (effective_base,
+				       mode_for_mem,
+				       adjust_amt);
+      rtx new_set = load_p
+	? gen_rtx_SET (change_reg, new_mem)
+	: gen_rtx_SET (new_mem, change_reg);
 
-  rtx *ignore = &XEXP (pats[1], load_p);
-  for (auto use : insns[1]->uses ())
-    if (!use->is_mem ()
-	&& refers_to_regno_p (use->regno (), use->regno () + 1, pats[1], ignore)
-	&& use->def () && use->def ()->insn () == insns[0])
-      {
-	// N.B. we allow a true dependence on the base address, as this
-	// happens in the case of auto-inc accesses.  Consider a post-increment
-	// load followed by a regular indexed load, for example.
-	if (dump_file)
-	  fprintf (dump_file,
-		   "%d has non-address true dependence on %d, rejecting pair\n",
-		   insns[1]->uid (), insns[0]->uid ());
-	return false;
-      }
+      pats[changed_insn] = new_set;
 
-  unsigned i = 0;
-  while (i < base_cands.length ())
-    {
-      base_cand &cand = base_cands[i];
+      auto keep_use = [&](use_info *u)
+	{
+	  return refers_to_regno_p (u->regno (), u->regno () + 1,
+				    change_pat, &XEXP (change_pat, load_p));
+	};
 
-      rtx *ignore[2] = {};
-      for (int j = 0; j < 2; j++)
-	if (cand.from_insn == !j)
-	  ignore[j] = &XEXP (cand_mems[j], 0);
+      // Drop any uses that only occur in the old address.
+      input_uses[changed_insn] = filter_accesses (attempt,
+						  input_uses[changed_insn],
+						  keep_use);
+    }
 
-      insn_info *h = first_hazard_after (insns[0], ignore[0]);
-      if (h && *h < *insns[1])
-	cand.hazards[0] = h;
+  rtx writeback_effect = NULL_RTX;
+  if (writeback)
+    writeback_effect = extract_writebacks (load_p, pats, changed_insn);
 
-      h = latest_hazard_before (insns[1], ignore[1]);
-      if (h && *h > *insns[0])
-	cand.hazards[1] = h;
+  const auto base_regno = base.def->regno ();
 
-      if (!cand.viable ())
-	{
-	  if (dump_file)
-	    fprintf (dump_file,
-		     "pair (%d,%d): rejecting base %d due to dataflow "
-		     "hazards (%d,%d)\n",
-		     insns[0]->uid (),
-		     insns[1]->uid (),
-		     cand.def->regno (),
-		     cand.hazards[0]->uid (),
-		     cand.hazards[1]->uid ());
+  if (base.from_insn == -1 && (writeback & 1))
+    {
+      // If the first of the candidate insns had a writeback form, we'll need to
+      // drop the use of the updated base register from the second insn's uses.
+      //
+      // N.B. we needn't worry about the base register occurring as a store
+      // operand, as we checked that there was no non-address true dependence
+      // between the insns in try_fuse_pair.
+      gcc_checking_assert (find_access (input_uses[1], base_regno));
+      input_uses[1] = check_remove_regno_access (attempt,
+						 input_uses[1],
+						 base_regno);
+    }
 
-	  base_cands.ordered_remove (i);
-	}
-      else
-	i++;
+  // Go through and drop uses that only occur in register notes,
+  // as we won't be preserving those.
+  for (int i = 0; i < 2; i++)
+    {
+      auto rti = insns[i]->rtl ();
+      if (!REG_NOTES (rti))
+	continue;
+
+      input_uses[i] = remove_note_accesses (attempt, input_uses[i]);
     }
 
-  if (base_cands.is_empty ())
+  // Edge case: if the first insn is a writeback load and the
+  // second insn is a non-writeback load which transfers into the base
+  // register, then we should drop the writeback altogether as the
+  // update of the base register from the second load should prevail.
+  //
+  // For example:
+  //   ldr x2, [x1], #8
+  //   ldr x1, [x1]
+  //   -->
+  //   ldp x2, x1, [x1]
+  if (writeback == 1
+      && load_p
+      && find_access (input_defs[1], base_regno))
     {
       if (dump_file)
 	fprintf (dump_file,
-		 "can't form pair (%d,%d) due to dataflow hazards\n",
-		 insns[0]->uid (), insns[1]->uid ());
-      return false;
+		 "  pair_mem: i%d has wb but subsequent i%d has non-wb "
+		 "update of base (r%d), dropping wb\n",
+		 insns[0]->uid (), insns[1]->uid (), base_regno);
+      gcc_assert (writeback_effect);
+      writeback_effect = NULL_RTX;
     }
 
-  insn_info *alias_hazards[4] = {};
+  // So far the patterns have been in instruction order,
+  // now we want them in offset order.
+  if (i1 != first)
+    std::swap (pats[0], pats[1]);
 
-  // First def of memory after the first insn, and last def of memory
-  // before the second insn, respectively.
-  def_info *mem_defs[2] = {};
-  if (load_p)
+  poly_int64 offsets[2];
+  for (int i = 0; i < 2; i++)
     {
-      if (!MEM_READONLY_P (cand_mems[0]))
-	{
-	  mem_defs[0] = memory_access (insns[0]->uses ())->def ();
-	  gcc_checking_assert (mem_defs[0]);
-	  mem_defs[0] = mem_defs[0]->next_def ();
-	}
-      if (!MEM_READONLY_P (cand_mems[1]))
+      rtx mem = XEXP (pats[i], load_p);
+      gcc_checking_assert (MEM_P (mem));
+      rtx base = strip_offset (XEXP (mem, 0), offsets + i);
+      gcc_checking_assert (REG_P (base));
+      gcc_checking_assert (base_regno == REGNO (base));
+    }
+
+  // If either of the original insns had writeback, but the resulting pair insn
+  // does not (can happen e.g. in the load pair edge case above, or if the writeback
+  // effects cancel out), then drop the def(s) of the base register as
+  // appropriate.
+  //
+  // Also drop the first def in the case that both of the original insns had
+  // writeback.  The second def could well have uses, but the first def should
+  // only be used by the second insn (and we dropped that use above).
+  for (int i = 0; i < 2; i++)
+    if ((!writeback_effect && (writeback & (1 << i)))
+	|| (i == 0 && writeback == 3))
+      input_defs[i] = check_remove_regno_access (attempt,
+						 input_defs[i],
+						 base_regno);
+
+  // If we don't currently have a writeback pair, and we don't have
+  // a load that clobbers the base register, look for a trailing destructive
+  // update of the base register and try and fold it in to make this into a
+  // writeback pair.
+  insn_info *trailing_add = nullptr;
+  if (pair_trailing_writeback_p ()
+      && !writeback_effect
+      && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
+					 XEXP (pats[0], 0), nullptr)
+		      && !refers_to_regno_p (base_regno, base_regno + 1,
+					     XEXP (pats[1], 0), nullptr))))
+    {
+      def_info *add_def;
+      trailing_add = find_trailing_add (insns, move_range, writeback,
+					&writeback_effect,
+					&add_def, base.def, offsets[0],
+					access_size);
+      if (trailing_add)
 	{
-	  mem_defs[1] = memory_access (insns[1]->uses ())->def ();
-	  gcc_checking_assert (mem_defs[1]);
+	  // The def of the base register from the trailing add should prevail.
+	  input_defs[0] = insert_access (attempt, add_def, input_defs[0]);
+	  gcc_assert (input_defs[0].is_valid ());
 	}
     }
-  else
+
+  // Now that we know what base mem we're going to use, check if it's OK
+  // with the pair mem policy.
+  rtx first_mem = XEXP (pats[0], load_p);
+  if (!pair_mem_ok_policy (first_mem,
+			   load_p,
+			   GET_MODE (first_mem)))
     {
-      mem_defs[0] = memory_access (insns[0]->defs ())->next_def ();
-      mem_defs[1] = memory_access (insns[1]->defs ())->prev_def ();
-      gcc_checking_assert (mem_defs[0]);
-      gcc_checking_assert (mem_defs[1]);
+      if (dump_file)
+	fprintf (dump_file, "punting on pair (%d,%d), pair mem policy says no\n",
+		 i1->uid (), i2->uid ());
+      return false;
     }
 
-  auto tombstone_p = [&](insn_info *insn) -> bool {
-    return m_emitted_tombstone
-	   && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ());
-  };
+  rtx reg_notes = combine_reg_notes (first, second, load_p);
 
-  store_walker<false, decltype(tombstone_p)>
-    forward_store_walker (mem_defs[0], cand_mems[0], insns[1], tombstone_p);
+  rtx pair_pat;
 
-  store_walker<true, decltype(tombstone_p)>
-    backward_store_walker (mem_defs[1], cand_mems[1], insns[0], tombstone_p);
+  set_multiword_subreg (first, second, load_p);
 
-  alias_walker *walkers[4] = {};
-  if (mem_defs[0])
-    walkers[0] = &forward_store_walker;
-  if (mem_defs[1])
-    walkers[1] = &backward_store_walker;
+  pair_pat = gen_load_store_pair (pats, writeback_effect, load_p);
+  if (pair_pat == NULL_RTX)
+    return false;
+
+  insn_change *pair_change = nullptr;
+  auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
+    rtx_insn *rti = change->insn ()->rtl ();
+    validate_unshare_change (rti, &PATTERN (rti), pair_pat, true);
+    validate_change (rti, &REG_NOTES (rti), reg_notes, true);
+  };
 
-  if (load_p && (mem_defs[0] || mem_defs[1]))
-    do_alias_analysis (alias_hazards, walkers, load_p);
-  else
+  if (load_p)
     {
-      // We want to find any loads hanging off the first store.
-      mem_defs[0] = memory_access (insns[0]->defs ());
-      load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]);
-      load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]);
-      walkers[2] = &forward_load_walker;
-      walkers[3] = &backward_load_walker;
-      do_alias_analysis (alias_hazards, walkers, load_p);
-      // Now consolidate hazards back down.
-      if (alias_hazards[2]
-	  && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0])))
-	alias_hazards[0] = alias_hazards[2];
+      changes.safe_push (make_delete (first));
+      pair_change = make_change (second);
+      changes.safe_push (pair_change);
 
-      if (alias_hazards[3]
-	  && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1])))
-	alias_hazards[1] = alias_hazards[3];
-    }
+      pair_change->move_range = move_range;
+      pair_change->new_defs = merge_access_arrays (attempt,
+						   input_defs[0],
+						   input_defs[1]);
+      gcc_assert (pair_change->new_defs.is_valid ());
 
-  if (alias_hazards[0] && alias_hazards[1]
-      && *alias_hazards[0] <= *alias_hazards[1])
-    {
-      if (dump_file)
-	fprintf (dump_file,
-		 "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n",
-		 i1->uid (), i2->uid (),
-		 alias_hazards[0]->uid (), alias_hazards[1]->uid ());
-      return false;
+      pair_change->new_uses
+	= merge_access_arrays (attempt,
+			       drop_memory_access (input_uses[0]),
+			       drop_memory_access (input_uses[1]));
+      gcc_assert (pair_change->new_uses.is_valid ());
+      set_pair_pat (pair_change);
     }
-
-  // Now narrow the hazards on each base candidate using
-  // the alias hazards.
-  i = 0;
-  while (i < base_cands.length ())
+  else
     {
-      base_cand &cand = base_cands[i];
-      if (alias_hazards[0] && (!cand.hazards[0]
-			       || *alias_hazards[0] < *cand.hazards[0]))
-	cand.hazards[0] = alias_hazards[0];
-      if (alias_hazards[1] && (!cand.hazards[1]
-			       || *alias_hazards[1] > *cand.hazards[1]))
-	cand.hazards[1] = alias_hazards[1];
-
-      if (cand.viable ())
-	i++;
-      else
+      using Action = stp_change_builder::action;
+      insn_info *store_to_change = try_repurpose_store (first, second,
+							move_range);
+      stp_change_builder builder (insns, store_to_change, pair_dst);
+      insn_change *change;
+      set_info *new_set = nullptr;
+      for (; !builder.done (); builder.advance ())
 	{
-	  if (dump_file)
-	    fprintf (dump_file, "pair (%d,%d): rejecting base %d due to "
-				"alias/dataflow hazards (%d,%d)",
-				insns[0]->uid (), insns[1]->uid (),
-				cand.def->regno (),
-				cand.hazards[0]->uid (),
-				cand.hazards[1]->uid ());
-
-	  base_cands.ordered_remove (i);
-	}
-    }
+	  auto action = builder.get_change ();
+	  change = (action.type == Action::INSERT)
+	    ? nullptr : make_change (action.insn);
+	  switch (action.type)
+	    {
+	    case Action::CHANGE:
+	    {
+	      set_pair_pat (change);
+	      change->new_uses = merge_access_arrays (attempt,
+						      input_uses[0],
+						      input_uses[1]);
+	      auto d1 = drop_memory_access (input_defs[0]);
+	      auto d2 = drop_memory_access (input_defs[1]);
+	      change->new_defs = merge_access_arrays (attempt, d1, d2);
+	      gcc_assert (change->new_defs.is_valid ());
+	      def_info *stp_def = memory_access (change->insn ()->defs ());
+	      change->new_defs = insert_access (attempt,
+						stp_def,
+						change->new_defs);
+	      gcc_assert (change->new_defs.is_valid ());
+	      change->move_range = move_range;
+	      pair_change = change;
+	      break;
+	    }
+	    case Action::TOMBSTONE:
+	    {
+	      tombstone_uids.quick_push (change->insn ()->uid ());
+	      rtx_insn *rti = change->insn ()->rtl ();
+	      validate_change (rti, &PATTERN (rti), gen_tombstone (), true);
+	      validate_change (rti, &REG_NOTES (rti), NULL_RTX, true);
+	      change->new_uses = use_array (nullptr, 0);
+	      break;
+	    }
+	    case Action::INSERT:
+	    {
+	      if (dump_file)
+		fprintf (dump_file,
+			 "  stp: cannot re-purpose candidate stores\n");
 
-  if (base_cands.is_empty ())
-    {
-      if (dump_file)
-	fprintf (dump_file,
-		 "cannot form pair (%d,%d) due to alias/dataflow hazards",
-		 insns[0]->uid (), insns[1]->uid ());
+	      auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat);
+	      change = make_change (new_insn);
+	      change->move_range = move_range;
+	      change->new_uses = merge_access_arrays (attempt,
+						      input_uses[0],
+						      input_uses[1]);
+	      gcc_assert (change->new_uses.is_valid ());
 
-      return false;
-    }
+	      auto d1 = drop_memory_access (input_defs[0]);
+	      auto d2 = drop_memory_access (input_defs[1]);
+	      change->new_defs = merge_access_arrays (attempt, d1, d2);
+	      gcc_assert (change->new_defs.is_valid ());
 
-  base_cand *base = &base_cands[0];
-  if (base_cands.length () > 1)
-    {
-      // If there are still multiple viable bases, it makes sense
-      // to choose one that allows us to reduce register pressure,
-      // for loads this means moving further down, for stores this
-      // means moving further up.
-      gcc_checking_assert (base_cands.length () == 2);
-      const int hazard_i = !load_p;
-      if (base->hazards[hazard_i])
-	{
-	  if (!base_cands[1].hazards[hazard_i])
-	    base = &base_cands[1];
-	  else if (load_p
-		   && *base_cands[1].hazards[hazard_i]
-		      > *(base->hazards[hazard_i]))
-	    base = &base_cands[1];
-	  else if (!load_p
-		   && *base_cands[1].hazards[hazard_i]
-		      < *(base->hazards[hazard_i]))
-	    base = &base_cands[1];
+	      new_set = crtl->ssa->create_set (attempt, new_insn, memory);
+	      change->new_defs = insert_access (attempt, new_set,
+						change->new_defs);
+	      gcc_assert (change->new_defs.is_valid ());
+	      pair_change = change;
+	      break;
+	    }
+	    case Action::FIXUP_USE:
+	    {
+	      // This use now needs to consume memory from our stp.
+	      if (dump_file)
+		fprintf (dump_file,
+			 "  stp: changing i%d to use mem from new stp "
+			 "(after i%d)\n",
+			 action.insn->uid (), pair_dst->uid ());
+	      change->new_uses = drop_memory_access (change->new_uses);
+	      gcc_assert (new_set);
+	      auto new_use = crtl->ssa->create_use (attempt, action.insn,
+						    new_set);
+	      change->new_uses = insert_access (attempt, new_use,
+						change->new_uses);
+	      break;
+	    }
+	    }
+	  changes.safe_push (change);
 	}
     }
 
-  // Otherwise, hazards[0] > hazards[1].
-  // Pair can be formed anywhere in (hazards[1], hazards[0]).
-  insn_range_info range (insns[0], insns[1]);
-  if (base->hazards[1])
-    range.first = base->hazards[1];
-  if (base->hazards[0])
-    range.last = base->hazards[0]->prev_nondebug_insn ();
-
-  // If the second insn can throw, narrow the move range to exactly that insn.
-  // This prevents us trying to move the second insn from the end of the BB.
-  if (cfun->can_throw_non_call_exceptions
-      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
+  if (trailing_add)
+    changes.safe_push (make_delete (trailing_add));
+  else if ((writeback & 2) && !writeback_effect)
     {
-      gcc_assert (range.includes (insns[1]));
-      range = insn_range_info (insns[1]);
+      // The second insn initially had writeback but now the pair does not,
+      // need to update any nondebug uses of the base register def in the
+      // second insn.  We'll take care of debug uses later.
+      auto def = find_access (insns[1]->defs (), base_regno);
+      gcc_assert (def);
+      auto set = dyn_cast<set_info *> (def);
+      if (set && set->has_nondebug_uses ())
+	{
+	  auto orig_use = find_access (insns[0]->uses (), base_regno);
+	  for (auto use : set->nondebug_insn_uses ())
+	    {
+	      auto change = make_change (use->insn ());
+	      change->new_uses = check_remove_regno_access (attempt,
+							    change->new_uses,
+							    base_regno);
+	      change->new_uses = insert_access (attempt,
+						orig_use,
+						change->new_uses);
+	      changes.safe_push (change);
+	    }
+	}
     }
 
-  // Placement strategy: push loads down and pull stores up, this should
-  // help register pressure by reducing live ranges.
-  if (load_p)
-    range.first = range.last;
-  else
-    range.last = range.first;
+  auto is_changing = insn_is_changing (changes);
+  for (unsigned i = 0; i < changes.length (); i++)
+    gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing));
 
-  if (dump_file)
+  // Check the pair pattern is recog'd.
+  if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing))
     {
-      auto print_hazard = [](insn_info *i)
-	{
-	  if (i)
-	    fprintf (dump_file, "%d", i->uid ());
-	  else
-	    fprintf (dump_file, "-");
-	};
-      auto print_pair = [print_hazard](insn_info **i)
-	{
-	  print_hazard (i[0]);
-	  fprintf (dump_file, ",");
-	  print_hazard (i[1]);
-	};
+      if (dump_file)
+	fprintf (dump_file, "  failed to form pair, recog failed\n");
 
-      fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
-	      load_p, insns[0]->uid (), insns[1]->uid (),
-	      base->def->regno ());
-      print_pair (base->hazards);
-      fprintf (dump_file, "), move_range: (%d,%d)\n",
-	       range.first->uid (), range.last->uid ());
+      // Free any reg notes we allocated.
+      while (reg_notes)
+	{
+	  rtx next = XEXP (reg_notes, 1);
+	  free_EXPR_LIST_node (reg_notes);
+	  reg_notes = next;
+	}
+      cancel_changes (0);
+      return false;
     }
 
-  return fuse_pair (load_p, access_size, writeback,
-		    i1, i2, *base, range);
+  gcc_assert (crtl->ssa->verify_insn_changes (changes));
+
+  // Fix up any debug uses that will be affected by the changes.
+  if (MAY_HAVE_DEBUG_INSNS)
+    fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add,
+		      load_p, writeback, writeback_effect, base_regno);
+
+  confirm_change_group ();
+  crtl->ssa->change_insns (changes);
+
+  gcc_checking_assert (tombstone_uids.length () <= 2);
+  for (auto uid : tombstone_uids)
+    track_tombstone (uid);
+
+  return true;
 }
 
-static void
-dump_insn_list (FILE *f, const insn_list_t &l)
+struct  aarch64_pair_fusion : public pair_fusion
 {
-  fprintf (f, "(");
+public:
+  aarch64_pair_fusion (bb_info *bb) : pair_fusion (bb) {};
+  bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode, bool load_p)
+  {
+    const bool fpsimd_op_p
+      = reload_completed
+      ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
+      : (GET_MODE_CLASS (mem_mode) != MODE_INT
+	 && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
+    return fpsimd_op_p;
+  }
 
-  auto i = l.begin ();
-  auto end = l.end ();
+  bool pair_mem_ok_policy (rtx first_mem, bool load_p, machine_mode mode)
+  {
+    return !aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
+						     load_p,
+						     mode);
+  }
+  bool pair_operand_mode_ok_p (machine_mode mode);
 
-  if (i != end)
-    fprintf (f, "%d", (*i)->uid ());
-  i++;
+  void transform_for_base (int encoded_lfs,
+			   access_group &group);
+  rtx gen_load_store_pair (rtx *pats,
+			   rtx writeback,
+			   bool load_p)
+  {
+    rtx pair_pat;
 
-  for (; i != end; i++)
-    fprintf (f, ", %d", (*i)->uid ());
+    if (writeback)
+      {
+	auto patvec = gen_rtvec (3, writeback, pats[0], pats[1]);
+	pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
+      }
+    else if (load_p)
+      pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
+					XEXP (pats[1], 0),
+					XEXP (pats[0], 1));
+    else
+      pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
+					 XEXP (pats[0], 1),
+					 XEXP (pats[1], 1));
+     return pair_pat;
+  }
 
-  fprintf (f, ")");
-}
+  void set_multiword_subreg (insn_info *i1, insn_info *i2, bool load_p)
+  {
+    if (i1 || i2 || load_p)
+      return;
+    return;
+  }
+  bool pair_trailing_writeback_p  ()
+  {
+    return aarch64_ldp_writeback > 1;
+  }
+  bool pair_check_register_operand (bool load_p, rtx reg_op, machine_mode mem_mode)
+  {
+    return  (load_p
+	     ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
+	     : !aarch64_stp_reg_operand (reg_op, mem_mode));
+  }
+  int pair_mem_alias_check_limit ()
+  {
+    return aarch64_ldp_alias_check_limit;
+  }
+  bool fuseable_store_p (insn_info *i1, insn_info *i2) { return i1 || i2;}
+  bool fuseable_load_p (insn_info *insn) { return insn;}
+  bool pair_is_writeback ()
+  {
+    return !aarch64_ldp_writeback;
+  }
+private:
+   int num_pairs;
+   rtx_insn *reg_ops[3];
+};
 
-DEBUG_FUNCTION void
-debug (const insn_list_t &l)
+static lfs_fields
+decode_lfs (int lfs)
 {
-  dump_insn_list (stderr, l);
-  fprintf (stderr, "\n");
+  bool load_p = (lfs & (1 << 3));
+  bool fpsimd_p = (lfs & (1 << 2));
+  unsigned size = 1U << ((lfs & 3) + 2);
+  return { load_p, fpsimd_p, size };
 }
 
-// LEFT_LIST and RIGHT_LIST are lists of candidate instructions where all insns
-// in LEFT_LIST are known to be adjacent to those in RIGHT_LIST.
-//
-// This function traverses the resulting 2D matrix of possible pair candidates
-// and attempts to merge them into pairs.
-//
-// The algorithm is straightforward: if we consider a combined list of
-// candidates X obtained by merging LEFT_LIST and RIGHT_LIST in program order,
-// then we advance through X until we reach a crossing point (where X[i] and
-// X[i+1] come from different source lists).
-//
-// At this point we know X[i] and X[i+1] are adjacent accesses, and we try to
-// fuse them into a pair.  If this succeeds, we remove X[i] and X[i+1] from
-// their original lists and continue as above.
-//
-// In the failure case, we advance through the source list containing X[i] and
-// continue as above (proceeding to the next crossing point).
-//
-// The rationale for skipping over groups of consecutive candidates from the
-// same source list is as follows:
-//
-// In the store case, the insns in the group can't be re-ordered over each
-// other as they are guaranteed to store to the same location, so we're
-// guaranteed not to lose opportunities by doing this.
-//
-// In the load case, subsequent loads from the same location are either
-// redundant (in which case they should have been cleaned up by an earlier
-// optimization pass) or there is an intervening aliasing hazard, in which case
-// we can't re-order them anyway, so provided earlier passes have cleaned up
-// redundant loads, we shouldn't miss opportunities by doing this.
-void
-ldp_bb_info::merge_pairs (insn_list_t &left_list,
-			  insn_list_t &right_list,
-			  bool load_p,
-			  unsigned access_size)
+// Return true if we should consider forming ldp/stp insns from memory
+// accesses with operand mode MODE at this stage in compilation.
+static bool
+ldp_operand_mode_ok_p (machine_mode mode)
 {
-  if (dump_file)
-    {
-      fprintf (dump_file, "merge_pairs [L=%d], cand vecs ", load_p);
-      dump_insn_list (dump_file, left_list);
-      fprintf (dump_file, " x ");
-      dump_insn_list (dump_file, right_list);
-      fprintf (dump_file, "\n");
-    }
+  const bool allow_qregs
+    = !(aarch64_tune_params.extra_tuning_flags
+	& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
 
-  auto iter_l = left_list.begin ();
-  auto iter_r = right_list.begin ();
+  if (!aarch64_ldpstp_operand_mode_p (mode))
+    return false;
 
-  while (iter_l != left_list.end () && iter_r != right_list.end ())
+  const auto size = GET_MODE_SIZE (mode).to_constant ();
+  if (size == 16 && !allow_qregs)
+    return false;
+
+  // We don't pair up TImode accesses before RA because TImode is
+  // special in that it can be allocated to a pair of GPRs or a single
+  // FPR, and the RA is best placed to make that decision.
+  return reload_completed || mode != TImode;
+}
+
+bool
+aarch64_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
+{
+  return (ldp_operand_mode_ok_p (mode));
+}
+
+// Given a pair mode MODE, return a canonical mode to be used for a single
+// operand of such a pair.  Currently we only use this when promoting a
+// non-writeback pair into a writeback pair, as it isn't otherwise clear
+// which mode to use when storing a modeless CONST_INT.
+static machine_mode
+aarch64_operand_mode_for_pair_mode (machine_mode mode)
+{
+  switch (mode)
     {
-      auto next_l = std::next (iter_l);
-      auto next_r = std::next (iter_r);
-      if (**iter_l < **iter_r
-	  && next_l != left_list.end ()
-	  && **next_l < **iter_r)
-	iter_l = next_l;
-      else if (**iter_r < **iter_l
-	       && next_r != right_list.end ()
-	       && **next_r < **iter_l)
-	iter_r = next_r;
-      else if (try_fuse_pair (load_p, access_size, *iter_l, *iter_r))
-	{
-	  left_list.erase (iter_l);
-	  iter_l = next_l;
-	  right_list.erase (iter_r);
-	  iter_r = next_r;
-	}
-      else if (**iter_l < **iter_r)
-	iter_l = next_l;
-      else
-	iter_r = next_r;
+    case E_V2x4QImode:
+      return SImode;
+    case E_V2x8QImode:
+      return DImode;
+    case E_V2x16QImode:
+      return V16QImode;
+    default:
+      gcc_unreachable ();
     }
 }
 
@@ -2890,8 +3108,8 @@ ldp_bb_info::merge_pairs (insn_list_t &left_list,
 // of accesses.  If we find two sets of adjacent accesses, call
 // merge_pairs.
 void
-ldp_bb_info::transform_for_base (int encoded_lfs,
-				 access_group &group)
+aarch64_pair_fusion::transform_for_base (int encoded_lfs,
+					 access_group &group)
 {
   const auto lfs = decode_lfs (encoded_lfs);
   const unsigned access_size = lfs.size;
@@ -2915,55 +3133,6 @@ ldp_bb_info::transform_for_base (int encoded_lfs,
     }
 }
 
-// If we emitted tombstone insns for this BB, iterate through the BB
-// and remove all the tombstone insns, being sure to reparent any uses
-// of mem to previous defs when we do this.
-void
-ldp_bb_info::cleanup_tombstones ()
-{
-  // No need to do anything if we didn't emit a tombstone insn for this BB.
-  if (!m_emitted_tombstone)
-    return;
-
-  for (auto insn : iterate_safely (m_bb->nondebug_insns ()))
-    {
-      if (!insn->is_real ()
-	  || !bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()))
-	continue;
-
-      auto set = as_a<set_info *> (memory_access (insn->defs ()));
-      if (set->has_any_uses ())
-	{
-	  auto prev_set = as_a<set_info *> (set->prev_def ());
-	  while (set->first_use ())
-	    crtl->ssa->reparent_use (set->first_use (), prev_set);
-	}
-
-      // Now set has no uses, we can delete it.
-      insn_change change (insn, insn_change::DELETE);
-      crtl->ssa->change_insn (change);
-    }
-}
-
-template<typename Map>
-void
-ldp_bb_info::traverse_base_map (Map &map)
-{
-  for (auto kv : map)
-    {
-      const auto &key = kv.first;
-      auto &value = kv.second;
-      transform_for_base (key.second, value);
-    }
-}
-
-void
-ldp_bb_info::transform ()
-{
-  traverse_base_map (expr_map);
-  traverse_base_map (def_map);
-}
-
 static void
 ldp_fusion_init ()
 {
@@ -3174,7 +3343,9 @@ void ldp_fusion_bb (bb_info *bb)
   const bool track_stores
     = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
 
-  ldp_bb_info bb_state (bb);
+  pair_fusion *bb_state;
+  aarch64_pair_fusion derived (bb);
+  bb_state = &derived;
 
   for (auto insn : bb->nondebug_insns ())
     {
@@ -3194,13 +3365,13 @@ void ldp_fusion_bb (bb_info *bb)
 	continue;
 
       if (track_stores && MEM_P (XEXP (pat, 0)))
-	bb_state.track_access (insn, false, XEXP (pat, 0));
+	bb_state->track_access (insn, false, XEXP (pat, 0));
       else if (track_loads && MEM_P (XEXP (pat, 1)))
-	bb_state.track_access (insn, true, XEXP (pat, 1));
+	bb_state->track_access (insn, true, XEXP (pat, 1));
     }
 
-  bb_state.transform ();
-  bb_state.cleanup_tombstones ();
+  bb_state->transform ();
+  bb_state->cleanup_tombstones ();
 }
 
 void ldp_fusion ()
@@ -3263,7 +3434,7 @@ public:
     }
 };
 
-} // anon namespace
+}// anon namespace
 
 rtl_opt_pass *
 make_pass_ldp_fusion (gcc::context *ctx)
-- 
2.39.3


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 0/2 V2] aarch64: Place target independent and dependent code in one file.
  2024-02-15 18:43 [PATCH 0/2 V2] aarch64: Place target independent and dependent code in one file Ajit Agarwal
@ 2024-02-22 19:49 ` Richard Sandiford
  2024-02-22 21:17   ` Segher Boessenkool
  2024-02-23 11:25   ` Ajit Agarwal
  0 siblings, 2 replies; 4+ messages in thread
From: Richard Sandiford @ 2024-02-22 19:49 UTC (permalink / raw)
  To: Ajit Agarwal
  Cc: Alex Coplan, Kewen.Lin, Segher Boessenkool, Michael Meissner,
	Peter Bergner, David Edelsohn, gcc-patches

Ajit Agarwal <aagarwa1@linux.ibm.com> writes:
> Hello Alex/Richard:
>
> I have placed target indpendent and target dependent code in
> aarch64-ldp-fusion for load store fusion.
>
> Common infrastructure of load store pair fusion is divided into
> target independent and target dependent code.
>
> Target independent code is the Generic code with pure virtual
> function to interface betwwen target independent and dependent
> code.
>
> Target dependent code is the implementation of pure virtual
> function for aarch64 target and the call to target independent
> code.

Thanks for the update.  This is still quite hard to review though.
Sorry to ask for another round, but could you split it up further?
The ideal thing would be if patches that move code do nothing other
than move code, and if patches that change code do those changes
in-place.

Richard

>
> Bootstrapped in aarch64-linux-gnu.
>
> Thanks & Regards
> Ajit
>
>
> aarch64: Place target independent and dependent code in one file.
>
> Common infrastructure of load store pair fusion is divided into
> target independent and target dependent code.
>
> Target independent code is the Generic code with pure virtual
> function to interface betwwen target independent and dependent
> code.
>
> Target dependent code is the implementation of pure virtual
> function for aarch64 target and the call to target independent
> code.
>
> 2024-02-15  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-ldp-fusion.cc: Place target
> 	independent and dependent code.
> ---
>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 3513 ++++++++++++----------
>  1 file changed, 1842 insertions(+), 1671 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> index 22ed95eb743..0ab842e2bbb 100644
> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
> @@ -17,6 +17,7 @@
>  // along with GCC; see the file COPYING3.  If not see
>  // <http://www.gnu.org/licenses/>.
>  
> +
>  #define INCLUDE_ALGORITHM
>  #define INCLUDE_FUNCTIONAL
>  #define INCLUDE_LIST
> @@ -37,13 +38,12 @@
>  #include "tree-hash-traits.h"
>  #include "print-tree.h"
>  #include "insn-attr.h"
> -
>  using namespace rtl_ssa;
>  
> -static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7;
> -static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1));
> -static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1;
> -static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1;
> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_BITS = 7;
> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_SIGN_BIT = (1 << (PAIR_MEM_IMM_BITS - 1));
> +static constexpr HOST_WIDE_INT PAIR_MEM_MAX_IMM = PAIR_MEM_IMM_SIGN_BIT - 1;
> +static constexpr HOST_WIDE_INT PAIR_MEM_MIN_IMM = -PAIR_MEM_MAX_IMM - 1;
>  
>  // We pack these fields (load_p, fpsimd_p, and size) into an integer
>  // (LFS) which we use as part of the key into the main hash tables.
> @@ -138,8 +138,144 @@ struct alt_base
>    poly_int64 offset;
>  };
>  
> +// Class that implements a state machine for building the changes needed to form
> +// a store pair instruction.  This allows us to easily build the changes in
> +// program order, as required by rtl-ssa.
> +struct stp_change_builder
> +{
> +  enum class state
> +  {
> +    FIRST,
> +    INSERT,
> +    FIXUP_USE,
> +    LAST,
> +    DONE
> +  };
> +
> +  enum class action
> +  {
> +    TOMBSTONE,
> +    CHANGE,
> +    INSERT,
> +    FIXUP_USE
> +  };
> +
> +  struct change
> +  {
> +    action type;
> +    insn_info *insn;
> +  };
> +
> +  bool done () const { return m_state == state::DONE; }
> +
> +  stp_change_builder (insn_info *insns[2],
> +		      insn_info *repurpose,
> +		      insn_info *dest)
> +    : m_state (state::FIRST), m_insns { insns[0], insns[1] },
> +      m_repurpose (repurpose), m_dest (dest), m_use (nullptr) {}
> +
> +  change get_change () const
> +  {
> +    switch (m_state)
> +      {
> +      case state::FIRST:
> +	return {
> +	  m_insns[0] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
> +	  m_insns[0]
> +	};
> +      case state::LAST:
> +	return {
> +	  m_insns[1] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
> +	  m_insns[1]
> +	};
> +      case state::INSERT:
> +	return { action::INSERT, m_dest };
> +      case state::FIXUP_USE:
> +	return { action::FIXUP_USE, m_use->insn () };
> +      case state::DONE:
> +	break;
> +      }
> +
> +    gcc_unreachable ();
> +  }
> +
> +  // Transition to the next state.
> +  void advance ()
> +  {
> +    switch (m_state)
> +      {
> +      case state::FIRST:
> +	if (m_repurpose)
> +	  m_state = state::LAST;
> +	else
> +	  m_state = state::INSERT;
> +	break;
> +      case state::INSERT:
> +      {
> +	def_info *def = memory_access (m_insns[0]->defs ());
> +	while (*def->next_def ()->insn () <= *m_dest)
> +	  def = def->next_def ();
> +
> +	// Now we know DEF feeds the insertion point for the new stp.
> +	// Look for any uses of DEF that will consume the new stp.
> +	gcc_assert (*def->insn () <= *m_dest
> +		    && *def->next_def ()->insn () > *m_dest);
> +
> +	auto set = as_a<set_info *> (def);
> +	for (auto use : set->nondebug_insn_uses ())
> +	  if (*use->insn () > *m_dest)
> +	    {
> +	      m_use = use;
> +	      break;
> +	    }
> +
> +	if (m_use)
> +	  m_state = state::FIXUP_USE;
> +	else
> +	  m_state = state::LAST;
> +	break;
> +      }
> +      case state::FIXUP_USE:
> +	m_use = m_use->next_nondebug_insn_use ();
> +	if (!m_use)
> +	  m_state = state::LAST;
> +	break;
> +      case state::LAST:
> +	m_state = state::DONE;
> +	break;
> +      case state::DONE:
> +	gcc_unreachable ();
> +      }
> +  }
> +
> +private:
> +  state m_state;
> +
> +  // Original candidate stores.
> +  insn_info *m_insns[2];
> +
> +  // If non-null, this is a candidate insn to change into an stp.  Otherwise we
> +  // are deleting both original insns and inserting a new insn for the stp.
> +  insn_info *m_repurpose;
> +
> +  // Destination of the stp, it will be placed immediately after m_dest.
> +  insn_info *m_dest;
> +
> +  // Current nondebug use that needs updating due to stp insertion.
> +  use_info *m_use;
> +};
> +
> +// Virtual base class for load/store walkers used in alias analysis.
> +struct alias_walker
> +{
> +  virtual bool conflict_p (int &budget) const = 0;
> +  virtual insn_info *insn () const = 0;
> +  virtual bool valid () const  = 0;
> +  virtual void advance () = 0;
> +};
> +
>  // State used by the pass for a given basic block.
> -struct ldp_bb_info
> +struct pair_fusion
>  {
>    using def_hash = nofree_ptr_hash<def_info>;
>    using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>;
> @@ -161,13 +297,13 @@ struct ldp_bb_info
>    static const size_t obstack_alignment = sizeof (void *);
>    bb_info *m_bb;
>  
> -  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
> +  pair_fusion (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
>    {
>      obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
>  				obstack_alignment, obstack_chunk_alloc,
>  				obstack_chunk_free);
>    }
> -  ~ldp_bb_info ()
> +  ~pair_fusion ()
>    {
>      obstack_free (&m_obstack, nullptr);
>  
> @@ -177,10 +313,50 @@ struct ldp_bb_info
>  	bitmap_obstack_release (&m_bitmap_obstack);
>        }
>    }
> +  void track_access (insn_info *, bool load, rtx mem);
> +  void transform ();
> +  void cleanup_tombstones ();
> +  virtual void set_multiword_subreg (insn_info *i1, insn_info *i2,
> +				     bool load_p) = 0;
> +  virtual rtx gen_load_store_pair (rtx *pats,  rtx writeback,
> +				   bool load_p) = 0;
> +  void merge_pairs (insn_list_t &, insn_list_t &,
> +		    bool load_p, unsigned access_size);
> +  virtual void transform_for_base (int load_size, access_group &group) = 0;
> +
> +  bool try_fuse_pair (bool load_p, unsigned access_size,
> +			     insn_info *i1, insn_info *i2);
> +
> +  bool fuse_pair (bool load_p, unsigned access_size,
> +		  int writeback,
> +		  insn_info *i1, insn_info *i2,
> +		  base_cand &base,
> +		  const insn_range_info &move_range);
> +
> +  void do_alias_analysis (insn_info *alias_hazards[4],
> +			  alias_walker *walkers[4],
> +			  bool load_p);
> +
> +  void track_tombstone (int uid);
> +
> +  bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
>  
> -  inline void track_access (insn_info *, bool load, rtx mem);
> -  inline void transform ();
> -  inline void cleanup_tombstones ();
> +  virtual bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode,
> +			       bool load_p) = 0;
> +
> +  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
> +  virtual bool pair_trailing_writeback_p () = 0;
> +  virtual bool pair_check_register_operand (bool load_p, rtx reg_op,
> +					    machine_mode mem_mode) = 0;
> +  virtual int pair_mem_alias_check_limit () = 0;
> +  virtual bool pair_is_writeback () = 0 ;
> +  virtual bool pair_mem_ok_policy (rtx first_mem, bool load_p,
> +				   machine_mode mode) = 0;
> +  virtual bool fuseable_store_p (insn_info *i1, insn_info *i2) = 0;
> +  virtual bool fuseable_load_p (insn_info *info) = 0;
> +
> +  template<typename Map>
> +    void traverse_base_map (Map &map);
>  
>  private:
>    obstack m_obstack;
> @@ -191,100 +367,292 @@ private:
>    bool m_emitted_tombstone;
>  
>    inline splay_tree_node<access_record *> *node_alloc (access_record *);
> -
> -  template<typename Map>
> -  inline void traverse_base_map (Map &map);
> -  inline void transform_for_base (int load_size, access_group &group);
> -
> -  inline void merge_pairs (insn_list_t &, insn_list_t &,
> -			   bool load_p, unsigned access_size);
> -
> -  inline bool try_fuse_pair (bool load_p, unsigned access_size,
> -			     insn_info *i1, insn_info *i2);
> -
> -  inline bool fuse_pair (bool load_p, unsigned access_size,
> -			 int writeback,
> -			 insn_info *i1, insn_info *i2,
> -			 base_cand &base,
> -			 const insn_range_info &move_range);
> -
> -  inline void track_tombstone (int uid);
> -
> -  inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
>  };
> -
> -splay_tree_node<access_record *> *
> -ldp_bb_info::node_alloc (access_record *access)
> -{
> -  using T = splay_tree_node<access_record *>;
> -  void *addr = obstack_alloc (&m_obstack, sizeof (T));
> -  return new (addr) T (access);
> -}
> -
> -// Given a mem MEM, if the address has side effects, return a MEM that accesses
> -// the same address but without the side effects.  Otherwise, return
> -// MEM unchanged.
> -static rtx
> -drop_writeback (rtx mem)
> +// Track the access INSN at offset OFFSET in this access group.
> +// ALLOC_NODE is used to allocate splay tree nodes.
> +template<typename Alloc>
> +void
> +access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
>  {
> -  rtx addr = XEXP (mem, 0);
> +  auto insert_before = [&](std::list<access_record>::iterator after)
> +    {
> +      auto it = list.emplace (after, offset);
> +      it->cand_insns.push_back (insn);
> +      it->place = it;
> +      return &*it;
> +    };
>  
> -  if (!side_effects_p (addr))
> -    return mem;
> +  if (!list.size ())
> +    {
> +      auto access = insert_before (list.end ());
> +      tree.insert_max_node (alloc_node (access));
> +      return;
> +    }
>  
> -  switch (GET_CODE (addr))
> +  auto compare = [&](splay_tree_node<access_record *> *node)
>      {
> -    case PRE_MODIFY:
> -      addr = XEXP (addr, 1);
> -      break;
> -    case POST_MODIFY:
> -    case POST_INC:
> -    case POST_DEC:
> -      addr = XEXP (addr, 0);
> -      break;
> -    case PRE_INC:
> -    case PRE_DEC:
> +      return compare_sizes_for_sort (offset, node->value ()->offset);
> +    };
> +  auto result = tree.lookup (compare);
> +  splay_tree_node<access_record *> *node = tree.root ();
> +  if (result == 0)
> +    node->value ()->cand_insns.push_back (insn);
> +  else
>      {
> -      poly_int64 adjustment = GET_MODE_SIZE (GET_MODE (mem));
> -      if (GET_CODE (addr) == PRE_DEC)
> -	adjustment *= -1;
> -      addr = plus_constant (GET_MODE (addr), XEXP (addr, 0), adjustment);
> -      break;
> -    }
> -    default:
> -      gcc_unreachable ();
> +      auto it = node->value ()->place;
> +      auto after = (result > 0) ? std::next (it) : it;
> +      auto access = insert_before (after);
> +      tree.insert_child (node, result > 0, alloc_node (access));
>      }
> -
> -  return change_address (mem, GET_MODE (mem), addr);
>  }
>  
> -// Convenience wrapper around strip_offset that can also look through
> -// RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
> -// MEM so that we know the mode of the access.
> -static rtx
> -ldp_strip_offset (rtx mem, poly_int64 *offset)
> +bool
> +store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget);
> +bool load_modified_by_store_p (insn_info *load,
> +			  insn_info *store,
> +			  int &budget);
> +
> +// Implement some common functionality used by both store_walker
> +// and load_walker.
> +template<bool reverse>
> +class def_walker : public alias_walker
>  {
> -  rtx addr = XEXP (mem, 0);
> +protected:
> +  using def_iter_t = typename std::conditional<reverse,
> +	reverse_def_iterator, def_iterator>::type;
>  
> -  switch (GET_CODE (addr))
> -    {
> -    case PRE_MODIFY:
> -    case POST_MODIFY:
> -      addr = strip_offset (XEXP (addr, 1), offset);
> -      gcc_checking_assert (REG_P (addr));
> -      gcc_checking_assert (rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr));
> -      break;
> -    case PRE_INC:
> -    case POST_INC:
> -      addr = XEXP (addr, 0);
> -      *offset = GET_MODE_SIZE (GET_MODE (mem));
> -      gcc_checking_assert (REG_P (addr));
> -      break;
> -    case PRE_DEC:
> -    case POST_DEC:
> -      addr = XEXP (addr, 0);
> -      *offset = -GET_MODE_SIZE (GET_MODE (mem));
> -      gcc_checking_assert (REG_P (addr));
> +  static use_info *start_use_chain (def_iter_t &def_iter)
> +  {
> +    set_info *set = nullptr;
> +    for (; *def_iter; def_iter++)
> +      {
> +	set = dyn_cast<set_info *> (*def_iter);
> +	if (!set)
> +	  continue;
> +
> +	use_info *use = reverse
> +	  ? set->last_nondebug_insn_use ()
> +	  : set->first_nondebug_insn_use ();
> +
> +	if (use)
> +	  return use;
> +      }
> +
> +    return nullptr;
> +  }
> +
> +  def_iter_t def_iter;
> +  insn_info *limit;
> +  def_walker (def_info *def, insn_info *limit) :
> +    def_iter (def), limit (limit) {}
> +
> +  virtual bool iter_valid () const { return *def_iter; }
> +
> +public:
> +  insn_info *insn () const override { return (*def_iter)->insn (); }
> +  void advance () override { def_iter++; }
> +  bool valid () const override final
> +  {
> +    if (!iter_valid ())
> +      return false;
> +
> +    if (reverse)
> +      return *(insn ()) > *limit;
> +    else
> +      return *(insn ()) < *limit;
> +  }
> +};
> +
> +// alias_walker that iterates over stores.
> +template<bool reverse, typename InsnPredicate>
> +class store_walker : public def_walker<reverse>
> +{
> +  rtx cand_mem;
> +  InsnPredicate tombstone_p;
> +
> +public:
> +  store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn,
> +		InsnPredicate tombstone_fn) :
> +    def_walker<reverse> (mem_def, limit_insn),
> +    cand_mem (mem), tombstone_p (tombstone_fn) {}
> +  bool conflict_p (int &budget) const override final
> +  {
> +    if (tombstone_p (this->insn ()))
> +      return false;
> +
> +    return store_modifies_mem_p (cand_mem, this->insn (), budget);
> +  }
> +};
> +
> +// alias_walker that iterates over loads.
> +template<bool reverse>
> +class load_walker : public def_walker<reverse>
> +{
> +  using Base = def_walker<reverse>;
> +  using use_iter_t = typename std::conditional<reverse,
> +	reverse_use_iterator, nondebug_insn_use_iterator>::type;
> +
> +  use_iter_t use_iter;
> +  insn_info *cand_store;
> +
> +  bool iter_valid () const override final { return *use_iter; }
> +
> +public:
> +  void advance () override final
> +  {
> +    use_iter++;
> +    if (*use_iter)
> +      return;
> +    this->def_iter++;
> +    use_iter = Base::start_use_chain (this->def_iter);
> +  }
> +
> +  insn_info *insn () const override final
> +  {
> +    return (*use_iter)->insn ();
> +  }
> +  bool conflict_p (int &budget) const override final
> +  {
> +    return load_modified_by_store_p (insn (), cand_store, budget);
> +  }
> +  load_walker (def_info *def, insn_info *store, insn_info *limit_insn)
> +    : Base (def, limit_insn),
> +      use_iter (Base::start_use_chain (this->def_iter)),
> +      cand_store (store) {}
> +};
> +
> +extern insn_info *
> +try_repurpose_store (insn_info *first,
> +		     insn_info *second,
> +		     const insn_range_info &move_range);
> +
> +void reset_debug_use (use_info *use);
> +
> +extern void
> +fixup_debug_uses (obstack_watermark &attempt,
> +		  insn_info *insns[2],
> +		  rtx orig_rtl[2],
> +		  insn_info *pair_dst,
> +		  insn_info *trailing_add,
> +		  bool load_p,
> +		  int writeback,
> +		  rtx writeback_effect,
> +		  unsigned base_regno);
> +
> +void
> +fixup_debug_uses_trailing_add (obstack_watermark &attempt,
> +			       insn_info *pair_dst,
> +			       insn_info *trailing_add,
> +			       rtx writeback_effect);
> +
> +
> +extern void
> +fixup_debug_use (obstack_watermark &attempt,
> +		 use_info *use,
> +		 def_info *def,
> +		 rtx base,
> +		 poly_int64 wb_offset);
> +
> +extern insn_info *
> +find_trailing_add (insn_info *insns[2],
> +		   const insn_range_info &pair_range,
> +		   int initial_writeback,
> +		   rtx *writeback_effect,
> +		   def_info **add_def,
> +		   def_info *base_def,
> +		   poly_int64 initial_offset,
> +		   unsigned access_size);
> +
> +rtx drop_writeback (rtx mem);
> +rtx pair_mem_strip_offset (rtx mem, poly_int64 *offset);
> +bool any_pre_modify_p (rtx x);
> +bool any_post_modify_p (rtx x);
> +int encode_lfs (lfs_fields fields);
> +extern insn_info * latest_hazard_before (insn_info *insn, rtx *ignore,
> +		      insn_info *ignore_insn = nullptr);
> +insn_info * first_hazard_after (insn_info *insn, rtx *ignore);
> +bool ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2);
> +insn_range_info get_def_range (def_info *def);
> +insn_range_info def_downwards_move_range (def_info *def);
> +insn_range_info def_upwards_move_range (def_info *def);
> +rtx gen_tombstone (void);
> +rtx filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr);
> +rtx combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p);
> +rtx extract_writebacks (bool load_p, rtx pats[2], int changed);
> +void do_alias_analysis (insn_info *alias_hazards[4],
> +		   alias_walker *walkers[4],
> +		   bool load_p);
> +int get_viable_bases (insn_info *insns[2],
> +		  vec<base_cand> &base_cands,
> +		  rtx cand_mems[2],
> +		  unsigned access_size,
> +		  bool reversed);
> +void dump_insn_list (FILE *f, const insn_list_t &l);
> +
> +// Given a mem MEM, if the address has side effects, return a MEM that accesses
> +// the same address but without the side effects.  Otherwise, return
> +// MEM unchanged.
> +rtx
> +drop_writeback (rtx mem)
> +{
> +  rtx addr = XEXP (mem, 0);
> +
> +  if (!side_effects_p (addr))
> +    return mem;
> +
> +  switch (GET_CODE (addr))
> +    {
> +    case PRE_MODIFY:
> +      addr = XEXP (addr, 1);
> +      break;
> +    case POST_MODIFY:
> +    case POST_INC:
> +    case POST_DEC:
> +      addr = XEXP (addr, 0);
> +      break;
> +    case PRE_INC:
> +    case PRE_DEC:
> +    {
> +      poly_int64 adjustment = GET_MODE_SIZE (GET_MODE (mem));
> +      if (GET_CODE (addr) == PRE_DEC)
> +	adjustment *= -1;
> +      addr = plus_constant (GET_MODE (addr), XEXP (addr, 0), adjustment);
> +      break;
> +    }
> +    default:
> +      gcc_unreachable ();
> +    }
> +
> +  return change_address (mem, GET_MODE (mem), addr);
> +}
> +
> +// Convenience wrapper around strip_offset that can also look through
> +// RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
> +// MEM so that we know the mode of the access.
> +rtx
> +pair_mem_strip_offset (rtx mem, poly_int64 *offset)
> +{
> +  rtx addr = XEXP (mem, 0);
> +
> +  switch (GET_CODE (addr))
> +    {
> +    case PRE_MODIFY:
> +    case POST_MODIFY:
> +      addr = strip_offset (XEXP (addr, 1), offset);
> +      gcc_checking_assert (REG_P (addr));
> +      gcc_checking_assert (rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr));
> +      break;
> +    case PRE_INC:
> +    case POST_INC:
> +      addr = XEXP (addr, 0);
> +      *offset = GET_MODE_SIZE (GET_MODE (mem));
> +      gcc_checking_assert (REG_P (addr));
> +      break;
> +    case PRE_DEC:
> +    case POST_DEC:
> +      addr = XEXP (addr, 0);
> +      *offset = -GET_MODE_SIZE (GET_MODE (mem));
> +      gcc_checking_assert (REG_P (addr));
>        break;
>  
>      default:
> @@ -295,7 +663,7 @@ ldp_strip_offset (rtx mem, poly_int64 *offset)
>  }
>  
>  // Return true if X is a PRE_{INC,DEC,MODIFY} rtx.
> -static bool
> +bool
>  any_pre_modify_p (rtx x)
>  {
>    const auto code = GET_CODE (x);
> @@ -303,318 +671,42 @@ any_pre_modify_p (rtx x)
>  }
>  
>  // Return true if X is a POST_{INC,DEC,MODIFY} rtx.
> -static bool
> +bool
>  any_post_modify_p (rtx x)
>  {
>    const auto code = GET_CODE (x);
>    return code == POST_INC || code == POST_DEC || code == POST_MODIFY;
>  }
>  
> -// Return true if we should consider forming ldp/stp insns from memory
> -// accesses with operand mode MODE at this stage in compilation.
> -static bool
> -ldp_operand_mode_ok_p (machine_mode mode)
> -{
> -  const bool allow_qregs
> -    = !(aarch64_tune_params.extra_tuning_flags
> -	& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
> -
> -  if (!aarch64_ldpstp_operand_mode_p (mode))
> -    return false;
> -
> -  const auto size = GET_MODE_SIZE (mode).to_constant ();
> -  if (size == 16 && !allow_qregs)
> -    return false;
> -
> -  // We don't pair up TImode accesses before RA because TImode is
> -  // special in that it can be allocated to a pair of GPRs or a single
> -  // FPR, and the RA is best placed to make that decision.
> -  return reload_completed || mode != TImode;
> -}
> -
>  // Given LFS (load_p, fpsimd_p, size) fields in FIELDS, encode these
>  // into an integer for use as a hash table key.
> -static int
> +int
>  encode_lfs (lfs_fields fields)
>  {
>    int size_log2 = exact_log2 (fields.size);
> -  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
> +  //gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
>    return ((int)fields.load_p << 3)
>      | ((int)fields.fpsimd_p << 2)
>      | (size_log2 - 2);
>  }
>  
> -// Inverse of encode_lfs.
> -static lfs_fields
> -decode_lfs (int lfs)
> -{
> -  bool load_p = (lfs & (1 << 3));
> -  bool fpsimd_p = (lfs & (1 << 2));
> -  unsigned size = 1U << ((lfs & 3) + 2);
> -  return { load_p, fpsimd_p, size };
> -}
> +// Dummy predicate that never ignores any insns.
> +static bool no_ignore (insn_info *) { return false; }
>  
> -// Track the access INSN at offset OFFSET in this access group.
> -// ALLOC_NODE is used to allocate splay tree nodes.
> -template<typename Alloc>
> -void
> -access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
> -{
> -  auto insert_before = [&](std::list<access_record>::iterator after)
> -    {
> -      auto it = list.emplace (after, offset);
> -      it->cand_insns.push_back (insn);
> -      it->place = it;
> -      return &*it;
> -    };
> -
> -  if (!list.size ())
> -    {
> -      auto access = insert_before (list.end ());
> -      tree.insert_max_node (alloc_node (access));
> -      return;
> -    }
> -
> -  auto compare = [&](splay_tree_node<access_record *> *node)
> -    {
> -      return compare_sizes_for_sort (offset, node->value ()->offset);
> -    };
> -  auto result = tree.lookup (compare);
> -  splay_tree_node<access_record *> *node = tree.root ();
> -  if (result == 0)
> -    node->value ()->cand_insns.push_back (insn);
> -  else
> -    {
> -      auto it = node->value ()->place;
> -      auto after = (result > 0) ? std::next (it) : it;
> -      auto access = insert_before (after);
> -      tree.insert_child (node, result > 0, alloc_node (access));
> -    }
> -}
> -
> -// Given a candidate access INSN (with mem MEM), see if it has a suitable
> -// MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
> -// LFS is used as part of the key to the hash table, see track_access.
> -bool
> -ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
> -{
> -  if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
> -    return false;
> -
> -  poly_int64 offset;
> -  tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
> -						  &offset);
> -  if (!base_expr || !DECL_P (base_expr))
> -    return false;
> -
> -  offset += MEM_OFFSET (mem);
> -
> -  const machine_mode mem_mode = GET_MODE (mem);
> -  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
> -
> -  // Punt on misaligned offsets.  LDP/STP instructions require offsets to be a
> -  // multiple of the access size, and we believe that misaligned offsets on
> -  // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
> -  if (!multiple_p (offset, mem_size))
> -    return false;
> -
> -  const auto key = std::make_pair (base_expr, encode_lfs (lfs));
> -  access_group &group = expr_map.get_or_insert (key, NULL);
> -  auto alloc = [&](access_record *access) { return node_alloc (access); };
> -  group.track (alloc, offset, insn);
> -
> -  if (dump_file)
> -    {
> -      fprintf (dump_file, "[bb %u] tracking insn %d via ",
> -	       m_bb->index (), insn->uid ());
> -      print_node_brief (dump_file, "mem expr", base_expr, 0);
> -      fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
> -	       lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
> -      print_dec (offset, dump_file);
> -      fprintf (dump_file, "]\n");
> -    }
> -
> -  return true;
> -}
> -
> -// Main function to begin pair discovery.  Given a memory access INSN,
> -// determine whether it could be a candidate for fusing into an ldp/stp,
> -// and if so, track it in the appropriate data structure for this basic
> -// block.  LOAD_P is true if the access is a load, and MEM is the mem
> -// rtx that occurs in INSN.
> -void
> -ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
> -{
> -  // We can't combine volatile MEMs, so punt on these.
> -  if (MEM_VOLATILE_P (mem))
> -    return;
> -
> -  // Ignore writeback accesses if the param says to do so.
> -  if (!aarch64_ldp_writeback
> -      && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
> -    return;
> -
> -  const machine_mode mem_mode = GET_MODE (mem);
> -  if (!ldp_operand_mode_ok_p (mem_mode))
> -    return;
> -
> -  rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);
> -
> -  // Ignore the access if the register operand isn't suitable for ldp/stp.
> -  if (load_p
> -      ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
> -      : !aarch64_stp_reg_operand (reg_op, mem_mode))
> -    return;
> -
> -  // We want to segregate FP/SIMD accesses from GPR accesses.
> -  //
> -  // Before RA, we use the modes, noting that stores of constant zero
> -  // operands use GPRs (even in non-integer modes).  After RA, we use
> -  // the hard register numbers.
> -  const bool fpsimd_op_p
> -    = reload_completed
> -    ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
> -    : (GET_MODE_CLASS (mem_mode) != MODE_INT
> -       && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
> -
> -  // Note ldp_operand_mode_ok_p already rejected VL modes.
> -  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
> -  const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };
> -
> -  if (track_via_mem_expr (insn, mem, lfs))
> -    return;
> -
> -  poly_int64 mem_off;
> -  rtx addr = XEXP (mem, 0);
> -  const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
> -  rtx base = ldp_strip_offset (mem, &mem_off);
> -  if (!REG_P (base))
> -    return;
> -
> -  // Need to calculate two (possibly different) offsets:
> -  //  - Offset at which the access occurs.
> -  //  - Offset of the new base def.
> -  poly_int64 access_off;
> -  if (autoinc_p && any_post_modify_p (addr))
> -    access_off = 0;
> -  else
> -    access_off = mem_off;
> -
> -  poly_int64 new_def_off = mem_off;
> -
> -  // Punt on accesses relative to eliminable regs.  Since we don't know the
> -  // elimination offset pre-RA, we should postpone forming pairs on such
> -  // accesses until after RA.
> -  //
> -  // As it stands, addresses with offsets in range for LDR but not
> -  // in range for LDP/STP are currently reloaded inefficiently,
> -  // ending up with a separate base register for each pair.
> -  //
> -  // In theory LRA should make use of
> -  // targetm.legitimize_address_displacement to promote sharing of
> -  // bases among multiple (nearby) address reloads, but the current
> -  // LRA code returns early from process_address_1 for operands that
> -  // satisfy "m", even if they don't satisfy the real (relaxed) address
> -  // constraint; this early return means we never get to the code
> -  // that calls targetm.legitimize_address_displacement.
> -  //
> -  // So for now, it's better to punt when we can't be sure that the
> -  // offset is in range for LDP/STP.  Out-of-range cases can then be
> -  // handled after RA by the out-of-range LDP/STP peepholes.  Eventually, it
> -  // would be nice to handle known out-of-range opportunities in the
> -  // pass itself (for stack accesses, this would be in the post-RA pass).
> -  if (!reload_completed
> -      && (REGNO (base) == FRAME_POINTER_REGNUM
> -	  || REGNO (base) == ARG_POINTER_REGNUM))
> -    return;
> -
> -  // Now need to find def of base register.
> -  use_info *base_use = find_access (insn->uses (), REGNO (base));
> -  gcc_assert (base_use);
> -  def_info *base_def = base_use->def ();
> -  if (!base_def)
> -    {
> -      if (dump_file)
> -	fprintf (dump_file,
> -		 "base register (regno %d) of insn %d is undefined",
> -		 REGNO (base), insn->uid ());
> -      return;
> -    }
> -
> -  alt_base *canon_base = canon_base_map.get (base_def);
> -  if (canon_base)
> -    {
> -      // Express this as the combined offset from the canonical base.
> -      base_def = canon_base->base;
> -      new_def_off += canon_base->offset;
> -      access_off += canon_base->offset;
> -    }
> -
> -  if (autoinc_p)
> -    {
> -      auto def = find_access (insn->defs (), REGNO (base));
> -      gcc_assert (def);
> -
> -      // Record that DEF = BASE_DEF + MEM_OFF.
> -      if (dump_file)
> -	{
> -	  pretty_printer pp;
> -	  pp_access (&pp, def, 0);
> -	  pp_string (&pp, " = ");
> -	  pp_access (&pp, base_def, 0);
> -	  fprintf (dump_file, "[bb %u] recording %s + ",
> -		   m_bb->index (), pp_formatted_text (&pp));
> -	  print_dec (new_def_off, dump_file);
> -	  fprintf (dump_file, "\n");
> -	}
> -
> -      alt_base base_rec { base_def, new_def_off };
> -      if (canon_base_map.put (def, base_rec))
> -	gcc_unreachable (); // Base defs should be unique.
> -    }
> -
> -  // Punt on misaligned offsets.  LDP/STP require offsets to be a multiple of
> -  // the access size.
> -  if (!multiple_p (mem_off, mem_size))
> -    return;
> -
> -  const auto key = std::make_pair (base_def, encode_lfs (lfs));
> -  access_group &group = def_map.get_or_insert (key, NULL);
> -  auto alloc = [&](access_record *access) { return node_alloc (access); };
> -  group.track (alloc, access_off, insn);
> -
> -  if (dump_file)
> -    {
> -      pretty_printer pp;
> -      pp_access (&pp, base_def, 0);
> -
> -      fprintf (dump_file, "[bb %u] tracking insn %d via %s",
> -	       m_bb->index (), insn->uid (), pp_formatted_text (&pp));
> -      fprintf (dump_file,
> -	       " [L=%d, WB=%d, FP=%d, %smode, off=",
> -	       lfs.load_p, autoinc_p, lfs.fpsimd_p, mode_name[mem_mode]);
> -      print_dec (access_off, dump_file);
> -      fprintf (dump_file, "]\n");
> -    }
> -}
> -
> -// Dummy predicate that never ignores any insns.
> -static bool no_ignore (insn_info *) { return false; }
> -
> -// Return the latest dataflow hazard before INSN.
> -//
> -// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore for
> -// dataflow purposes.  This is needed when considering changing the RTL base of
> -// an access discovered through a MEM_EXPR base.
> -//
> -// If IGNORE_INSN is non-NULL, we should further ignore any hazards arising
> -// from that insn.
> -//
> -// N.B. we ignore any defs/uses of memory here as we deal with that separately,
> -// making use of alias disambiguation.
> -static insn_info *
> -latest_hazard_before (insn_info *insn, rtx *ignore,
> -		      insn_info *ignore_insn = nullptr)
> +// Return the latest dataflow hazard before INSN.
> +//
> +// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore for
> +// dataflow purposes.  This is needed when considering changing the RTL base of
> +// an access discovered through a MEM_EXPR base.
> +//
> +// If IGNORE_INSN is non-NULL, we should further ignore any hazards arising
> +// from that insn.
> +//
> +// N.B. we ignore any defs/uses of memory here as we deal with that separately,
> +// making use of alias disambiguation.
> +insn_info *
> +latest_hazard_before (insn_info *insn, rtx *ignore,
> +		      insn_info *ignore_insn)// = nullptr)
>  {
>    insn_info *result = nullptr;
>  
> @@ -698,7 +790,7 @@ latest_hazard_before (insn_info *insn, rtx *ignore,
>  //
>  // N.B. we ignore any defs/uses of memory here as we deal with that separately,
>  // making use of alias disambiguation.
> -static insn_info *
> +insn_info *
>  first_hazard_after (insn_info *insn, rtx *ignore)
>  {
>    insn_info *result = nullptr;
> @@ -787,7 +879,7 @@ first_hazard_after (insn_info *insn, rtx *ignore)
>  }
>  
>  // Return true iff R1 and R2 overlap.
> -static bool
> +bool
>  ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
>  {
>    // If either range is empty, then their intersection is empty.
> @@ -799,9 +891,8 @@ ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
>    // Inverting this, we get the below.
>    return *r1.last >= *r2.first && *r2.last >= *r1.first;
>  }
> -
>  // Get the range of insns that def feeds.
> -static insn_range_info get_def_range (def_info *def)
> + insn_range_info get_def_range (def_info *def)
>  {
>    insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
>    return { def->insn (), last };
> @@ -809,7 +900,7 @@ static insn_range_info get_def_range (def_info *def)
>  
>  // Given a def (of memory), return the downwards range within which we
>  // can safely move this def.
> -static insn_range_info
> +insn_range_info
>  def_downwards_move_range (def_info *def)
>  {
>    auto range = get_def_range (def);
> @@ -827,7 +918,7 @@ def_downwards_move_range (def_info *def)
>  
>  // Given a def (of memory), return the upwards range within which we can
>  // safely move this def.
> -static insn_range_info
> +insn_range_info
>  def_upwards_move_range (def_info *def)
>  {
>    def_info *prev = def->prev_def ();
> @@ -844,189 +935,18 @@ def_upwards_move_range (def_info *def)
>    return range;
>  }
>  
> -// Class that implements a state machine for building the changes needed to form
> -// a store pair instruction.  This allows us to easily build the changes in
> -// program order, as required by rtl-ssa.
> -struct stp_change_builder
> +// Generate the RTL pattern for a "tombstone"; used temporarily during this pass
> +// to replace stores that are marked for deletion where we can't immediately
> +// delete the store (since there are uses of mem hanging off the store).
> +//
> +// These are deleted at the end of the pass and uses re-parented appropriately
> +// at this point.
> +rtx
> +gen_tombstone (void)
>  {
> -  enum class state
> -  {
> -    FIRST,
> -    INSERT,
> -    FIXUP_USE,
> -    LAST,
> -    DONE
> -  };
> -
> -  enum class action
> -  {
> -    TOMBSTONE,
> -    CHANGE,
> -    INSERT,
> -    FIXUP_USE
> -  };
> -
> -  struct change
> -  {
> -    action type;
> -    insn_info *insn;
> -  };
> -
> -  bool done () const { return m_state == state::DONE; }
> -
> -  stp_change_builder (insn_info *insns[2],
> -		      insn_info *repurpose,
> -		      insn_info *dest)
> -    : m_state (state::FIRST), m_insns { insns[0], insns[1] },
> -      m_repurpose (repurpose), m_dest (dest), m_use (nullptr) {}
> -
> -  change get_change () const
> -  {
> -    switch (m_state)
> -      {
> -      case state::FIRST:
> -	return {
> -	  m_insns[0] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
> -	  m_insns[0]
> -	};
> -      case state::LAST:
> -	return {
> -	  m_insns[1] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
> -	  m_insns[1]
> -	};
> -      case state::INSERT:
> -	return { action::INSERT, m_dest };
> -      case state::FIXUP_USE:
> -	return { action::FIXUP_USE, m_use->insn () };
> -      case state::DONE:
> -	break;
> -      }
> -
> -    gcc_unreachable ();
> -  }
> -
> -  // Transition to the next state.
> -  void advance ()
> -  {
> -    switch (m_state)
> -      {
> -      case state::FIRST:
> -	if (m_repurpose)
> -	  m_state = state::LAST;
> -	else
> -	  m_state = state::INSERT;
> -	break;
> -      case state::INSERT:
> -      {
> -	def_info *def = memory_access (m_insns[0]->defs ());
> -	while (*def->next_def ()->insn () <= *m_dest)
> -	  def = def->next_def ();
> -
> -	// Now we know DEF feeds the insertion point for the new stp.
> -	// Look for any uses of DEF that will consume the new stp.
> -	gcc_assert (*def->insn () <= *m_dest
> -		    && *def->next_def ()->insn () > *m_dest);
> -
> -	auto set = as_a<set_info *> (def);
> -	for (auto use : set->nondebug_insn_uses ())
> -	  if (*use->insn () > *m_dest)
> -	    {
> -	      m_use = use;
> -	      break;
> -	    }
> -
> -	if (m_use)
> -	  m_state = state::FIXUP_USE;
> -	else
> -	  m_state = state::LAST;
> -	break;
> -      }
> -      case state::FIXUP_USE:
> -	m_use = m_use->next_nondebug_insn_use ();
> -	if (!m_use)
> -	  m_state = state::LAST;
> -	break;
> -      case state::LAST:
> -	m_state = state::DONE;
> -	break;
> -      case state::DONE:
> -	gcc_unreachable ();
> -      }
> -  }
> -
> -private:
> -  state m_state;
> -
> -  // Original candidate stores.
> -  insn_info *m_insns[2];
> -
> -  // If non-null, this is a candidate insn to change into an stp.  Otherwise we
> -  // are deleting both original insns and inserting a new insn for the stp.
> -  insn_info *m_repurpose;
> -
> -  // Destionation of the stp, it will be placed immediately after m_dest.
> -  insn_info *m_dest;
> -
> -  // Current nondebug use that needs updating due to stp insertion.
> -  use_info *m_use;
> -};
> -
> -// Given candidate store insns FIRST and SECOND, see if we can re-purpose one
> -// of them (together with its def of memory) for the stp insn.  If so, return
> -// that insn.  Otherwise, return null.
> -static insn_info *
> -try_repurpose_store (insn_info *first,
> -		     insn_info *second,
> -		     const insn_range_info &move_range)
> -{
> -  def_info * const defs[2] = {
> -    memory_access (first->defs ()),
> -    memory_access (second->defs ())
> -  };
> -
> -  if (move_range.includes (first)
> -      || ranges_overlap_p (move_range, def_downwards_move_range (defs[0])))
> -    return first;
> -
> -  if (move_range.includes (second)
> -      || ranges_overlap_p (move_range, def_upwards_move_range (defs[1])))
> -    return second;
> -
> -  return nullptr;
> -}
> -
> -// Generate the RTL pattern for a "tombstone"; used temporarily during this pass
> -// to replace stores that are marked for deletion where we can't immediately
> -// delete the store (since there are uses of mem hanging off the store).
> -//
> -// These are deleted at the end of the pass and uses re-parented appropriately
> -// at this point.
> -static rtx
> -gen_tombstone (void)
> -{
> -  return gen_rtx_CLOBBER (VOIDmode,
> -			  gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)));
> -}
> -
> -// Given a pair mode MODE, return a canonical mode to be used for a single
> -// operand of such a pair.  Currently we only use this when promoting a
> -// non-writeback pair into a writeback pair, as it isn't otherwise clear
> -// which mode to use when storing a modeless CONST_INT.
> -static machine_mode
> -aarch64_operand_mode_for_pair_mode (machine_mode mode)
> -{
> -  switch (mode)
> -    {
> -    case E_V2x4QImode:
> -      return SImode;
> -    case E_V2x8QImode:
> -      return DImode;
> -    case E_V2x16QImode:
> -      return V16QImode;
> -    default:
> -      gcc_unreachable ();
> -    }
> -}
> +  return gen_rtx_CLOBBER (VOIDmode,
> +			  gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)));
> +}
>  
>  // Go through the reg notes rooted at NOTE, dropping those that we should drop,
>  // and preserving those that we want to keep by prepending them to (and
> @@ -1034,7 +954,7 @@ aarch64_operand_mode_for_pair_mode (machine_mode mode)
>  // REG_EH_REGION note in the resulting list.  FR_EXPR is used to return any
>  // REG_FRAME_RELATED_EXPR note we find, as these can need special handling in
>  // combine_reg_notes.
> -static rtx
> +rtx
>  filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
>  {
>    for (; note; note = XEXP (note, 1))
> @@ -1084,7 +1004,7 @@ filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
>  
>  // Return the notes that should be attached to a combination of I1 and I2, where
>  // *I1 < *I2.  LOAD_P is true for loads.
> -static rtx
> +rtx
>  combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>  {
>    // Temporary storage for REG_FRAME_RELATED_EXPR notes.
> @@ -1100,8 +1020,8 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>    if (!load_p)
>      {
>        // Simple frame-related sp-relative saves don't need CFI notes, but when
> -      // we combine them into an stp we will need a CFI note as dwarf2cfi can't
> -      // interpret the unspec pair representation directly.
> +      // we combine them into a paired store we will need a CFI note as
> +      // dwarf2cfi can't interpret the unspec pair representation directly.
>        if (RTX_FRAME_RELATED_P (i1->rtl ()) && !fr_expr[0])
>  	fr_expr[0] = copy_rtx (PATTERN (i1->rtl ()));
>        if (RTX_FRAME_RELATED_P (i2->rtl ()) && !fr_expr[1])
> @@ -1133,7 +1053,7 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>  // relative to the initial value of the base register, and output these
>  // in PATS.  Return an rtx that represents the overall change to the
>  // base register.
> -static rtx
> +rtx
>  extract_writebacks (bool load_p, rtx pats[2], int changed)
>  {
>    rtx base_reg = NULL_RTX;
> @@ -1150,7 +1070,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
>        const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
>  
>        poly_int64 offset;
> -      rtx this_base = ldp_strip_offset (mem, &offset);
> +      rtx this_base = pair_mem_strip_offset (mem, &offset);
>        gcc_assert (REG_P (this_base));
>        if (base_reg)
>  	gcc_assert (rtx_equal_p (base_reg, this_base));
> @@ -1207,7 +1127,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
>  // base register.  If there is one, we choose the first such update after
>  // PAIR_DST that is still in the same BB as our pair.  We return the new def in
>  // *ADD_DEF and the resulting writeback effect in *WRITEBACK_EFFECT.
> -static insn_info *
> +insn_info *
>  find_trailing_add (insn_info *insns[2],
>  		   const insn_range_info &pair_range,
>  		   int initial_writeback,
> @@ -1286,7 +1206,7 @@ find_trailing_add (insn_info *insns[2],
>  
>    off_hwi /= access_size;
>  
> -  if (off_hwi < LDP_MIN_IMM || off_hwi > LDP_MAX_IMM)
> +  if (off_hwi < PAIR_MEM_MIN_IMM || off_hwi > PAIR_MEM_MAX_IMM)
>      return nullptr;
>  
>    auto dump_prefix = [&]()
> @@ -1325,26 +1245,93 @@ find_trailing_add (insn_info *insns[2],
>    return nullptr;
>  }
>  
> -// We just emitted a tombstone with uid UID, track it in a bitmap for
> -// this BB so we can easily identify it later when cleaning up tombstones.
> -void
> -ldp_bb_info::track_tombstone (int uid)
> +// Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
> +// within our BUDGET for alias analysis.
> +bool
> +store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
>  {
> -  if (!m_emitted_tombstone)
> +  if (!budget)
>      {
> -      // Lazily initialize the bitmap for tracking tombstone insns.
> -      bitmap_obstack_initialize (&m_bitmap_obstack);
> -      bitmap_initialize (&m_tombstone_bitmap, &m_bitmap_obstack);
> -      m_emitted_tombstone = true;
> +      if (dump_file)
> +	{
> +	  fprintf (dump_file,
> +		   "exceeded budget, assuming store %d aliases with mem ",
> +		   store_insn->uid ());
> +	  print_simple_rtl (dump_file, mem);
> +	  fprintf (dump_file, "\n");
> +	}
> +
> +      return true;
>      }
>  
> -  if (!bitmap_set_bit (&m_tombstone_bitmap, uid))
> -    gcc_unreachable (); // Bit should have changed.
> +  budget--;
> +  return memory_modified_in_insn_p (mem, store_insn->rtl ());
> +}
> +
> +// Return true if LOAD may be modified by STORE.  Make sure we keep
> +// within our BUDGET for alias analysis.
> +bool
> +load_modified_by_store_p (insn_info *load,
> +			  insn_info *store,
> +			  int &budget)
> +{
> +  gcc_checking_assert (budget >= 0);
> +
> +  if (!budget)
> +    {
> +      if (dump_file)
> +	{
> +	  fprintf (dump_file,
> +		   "exceeded budget, assuming load %d aliases with store %d\n",
> +		   load->uid (), store->uid ());
> +	}
> +      return true;
> +    }
> +
> +  // It isn't safe to re-order stores over calls.
> +  if (CALL_P (load->rtl ()))
> +    return true;
> +
> +  budget--;
> +
> +  // Iterate over all MEMs in the load, seeing if any alias with
> +  // our store.
> +  subrtx_var_iterator::array_type array;
> +  rtx pat = PATTERN (load->rtl ());
> +  FOR_EACH_SUBRTX_VAR (iter, array, pat, NONCONST)
> +    if (MEM_P (*iter) && memory_modified_in_insn_p (*iter, store->rtl ()))
> +      return true;
> +
> +  return false;
> +}
> +// Given candidate store insns FIRST and SECOND, see if we can re-purpose one
> +// of them (together with its def of memory) for the stp insn.  If so, return
> +// that insn.  Otherwise, return null.
> +insn_info *
> +try_repurpose_store (insn_info *first,
> +		     insn_info *second,
> +		     const insn_range_info &move_range)
> +{
> +  def_info * const defs[2] = {
> +    memory_access (first->defs ()),
> +    memory_access (second->defs ())
> +  };
> +
> +  if (move_range.includes (first)
> +      || ranges_overlap_p (move_range, def_downwards_move_range (defs[0])))
> +    return first;
> +
> +  if (move_range.includes (second)
> +      || ranges_overlap_p (move_range, def_upwards_move_range (defs[1])))
> +    return second;
> +
> +  return nullptr;
>  }
>  
>  // Reset the debug insn containing USE (the debug insn has been
>  // optimized away).
> -static void
> +void
>  reset_debug_use (use_info *use)
>  {
>    auto use_insn = use->insn ();
> @@ -1355,12 +1342,43 @@ reset_debug_use (use_info *use)
>    crtl->ssa->change_insn (change);
>  }
>  
> +// Update debug uses when folding in a trailing add insn to form a
> +// writeback pair.
> +//
> +// ATTEMPT is used to allocate RTL-SSA temporaries for the changes,
> +// the final pair is placed immediately after PAIR_DST, TRAILING_ADD
> +// is a trailing add insn which is being folded into the pair to make it
> +// use writeback addressing, and WRITEBACK_EFFECT is the pattern for
> +// TRAILING_ADD.
> +void
> +fixup_debug_uses_trailing_add (obstack_watermark &attempt,
> +			       insn_info *pair_dst,
> +			       insn_info *trailing_add,
> +			       rtx writeback_effect)
> +{
> +  rtx base = SET_DEST (writeback_effect);
> +
> +  poly_int64 wb_offset;
> +  rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset);
> +  gcc_checking_assert (rtx_equal_p (base, base2));
> +
> +  auto defs = trailing_add->defs ();
> +  gcc_checking_assert (defs.size () == 1);
> +  def_info *def = defs[0];
> +
> +  if (auto set = safe_dyn_cast<set_info *> (def->prev_def ()))
> +    for (auto use : iterate_safely (set->debug_insn_uses ()))
> +      if (*use->insn () > *pair_dst)
> +	// DEF is getting re-ordered above USE, fix up USE accordingly.
> +	fixup_debug_use (attempt, use, def, base, wb_offset);
> +}
> +
>  // USE is a debug use that needs updating because DEF (a def of the same
>  // register) is being re-ordered over it.  If BASE is non-null, then DEF
>  // is an update of the register BASE by a constant, given by WB_OFFSET,
>  // and we can preserve debug info by accounting for the change in side
>  // effects.
> -static void
> +void
>  fixup_debug_use (obstack_watermark &attempt,
>  		 use_info *use,
>  		 def_info *def,
> @@ -1455,37 +1473,6 @@ fixup_debug_use (obstack_watermark &attempt,
>      }
>  }
>  
> -// Update debug uses when folding in a trailing add insn to form a
> -// writeback pair.
> -//
> -// ATTEMPT is used to allocate RTL-SSA temporaries for the changes,
> -// the final pair is placed immediately after PAIR_DST, TRAILING_ADD
> -// is a trailing add insn which is being folded into the pair to make it
> -// use writeback addressing, and WRITEBACK_EFFECT is the pattern for
> -// TRAILING_ADD.
> -static void
> -fixup_debug_uses_trailing_add (obstack_watermark &attempt,
> -			       insn_info *pair_dst,
> -			       insn_info *trailing_add,
> -			       rtx writeback_effect)
> -{
> -  rtx base = SET_DEST (writeback_effect);
> -
> -  poly_int64 wb_offset;
> -  rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset);
> -  gcc_checking_assert (rtx_equal_p (base, base2));
> -
> -  auto defs = trailing_add->defs ();
> -  gcc_checking_assert (defs.size () == 1);
> -  def_info *def = defs[0];
> -
> -  if (auto set = safe_dyn_cast<set_info *> (def->prev_def ()))
> -    for (auto use : iterate_safely (set->debug_insn_uses ()))
> -      if (*use->insn () > *pair_dst)
> -	// DEF is getting re-ordered above USE, fix up USE accordingly.
> -	fixup_debug_use (attempt, use, def, base, wb_offset);
> -}
> -
>  // Called from fuse_pair, fixes up any debug uses that will be affected
>  // by the changes.
>  //
> @@ -1500,7 +1487,7 @@ fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>  // writeback, and WRITEBACK_EFFECT is an rtx describing the overall update to
>  // the base register in the final pair (if any).  BASE_REGNO gives the register
>  // number of the base register used in the final pair.
> -static void
> +void
>  fixup_debug_uses (obstack_watermark &attempt,
>  		  insn_info *insns[2],
>  		  rtx orig_rtl[2],
> @@ -1528,7 +1515,7 @@ fixup_debug_uses (obstack_watermark &attempt,
>  	  gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0)))
>  			       == RTX_AUTOINC);
>  
> -	  base = ldp_strip_offset (mem, &offset);
> +	  base = pair_mem_strip_offset (mem, &offset);
>  	  gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno);
>  	}
>        fixup_debug_use (attempt, use, def, base, offset);
> @@ -1651,621 +1638,846 @@ fixup_debug_uses (obstack_watermark &attempt,
>  				   writeback_effect);
>  }
>  
> -// Try and actually fuse the pair given by insns I1 and I2.
> -//
> -// Here we've done enough analysis to know this is safe, we only
> -// reject the pair at this stage if either the tuning policy says to,
> -// or recog fails on the final pair insn.
> -//
> -// LOAD_P is true for loads, ACCESS_SIZE gives the access size of each
> -// candidate insn.  Bit i of WRITEBACK is set if the ith insn (in program
> -// order) uses writeback.
> +// Given INSNS (in program order) which are known to be adjacent, look
> +// to see if either insn has a suitable RTL (register) base that we can
> +// use to form a pair.  Push these to BASE_CANDS if we find any.  CAND_MEMs
> +// gives the relevant mems from the candidate insns, ACCESS_SIZE gives the
> +// size of a single candidate access, and REVERSED says whether the accesses
> +// are inverted in offset order.
>  //
> -// BASE gives the chosen base candidate for the pair and MOVE_RANGE is
> -// a singleton range which says where to place the pair.
> -bool
> -ldp_bb_info::fuse_pair (bool load_p,
> -			unsigned access_size,
> -			int writeback,
> -			insn_info *i1, insn_info *i2,
> -			base_cand &base,
> -			const insn_range_info &move_range)
> +// Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
> +// addressing.
> +int
> +get_viable_bases (insn_info *insns[2],
> +		  vec<base_cand> &base_cands,
> +		  rtx cand_mems[2],
> +		  unsigned access_size,
> +		  bool reversed)
>  {
> -  auto attempt = crtl->ssa->new_change_attempt ();
> -
> -  auto make_change = [&attempt](insn_info *insn)
> -    {
> -      return crtl->ssa->change_alloc<insn_change> (attempt, insn);
> -    };
> -  auto make_delete = [&attempt](insn_info *insn)
> -    {
> -      return crtl->ssa->change_alloc<insn_change> (attempt,
> -						   insn,
> -						   insn_change::DELETE);
> -    };
> -
> -  insn_info *first = (*i1 < *i2) ? i1 : i2;
> -  insn_info *second = (first == i1) ? i2 : i1;
> -
> -  insn_info *pair_dst = move_range.singleton ();
> -  gcc_assert (pair_dst);
> -
> -  insn_info *insns[2] = { first, second };
> -
> -  auto_vec<insn_change *> changes;
> -  auto_vec<int, 2> tombstone_uids (2);
> -
> -  rtx pats[2] = {
> -    PATTERN (first->rtl ()),
> -    PATTERN (second->rtl ())
> -  };
> -
> -  // Make copies of the patterns as we might need to refer to the original RTL
> -  // later, for example when updating debug uses (which is after we've updated
> -  // one or both of the patterns in the candidate insns).
> -  rtx orig_rtl[2];
> +  // We discovered this pair through a common base.  Need to ensure that
> +  // we have a common base register that is live at both locations.
> +  def_info *base_defs[2] = {};
> +  int writeback = 0;
>    for (int i = 0; i < 2; i++)
> -    orig_rtl[i] = copy_rtx (pats[i]);
> -
> -  use_array input_uses[2] = { first->uses (), second->uses () };
> -  def_array input_defs[2] = { first->defs (), second->defs () };
> -
> -  int changed_insn = -1;
> -  if (base.from_insn != -1)
>      {
> -      // If we're not already using a shared base, we need
> -      // to re-write one of the accesses to use the base from
> -      // the other insn.
> -      gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1);
> -      changed_insn = !base.from_insn;
> -
> -      rtx base_pat = pats[base.from_insn];
> -      rtx change_pat = pats[changed_insn];
> -      rtx base_mem = XEXP (base_pat, load_p);
> -      rtx change_mem = XEXP (change_pat, load_p);
> +      const bool is_lower = (i == reversed);
> +      poly_int64 poly_off;
> +      rtx base = pair_mem_strip_offset (cand_mems[i], &poly_off);
> +      if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
> +	writeback |= (1 << i);
>  
> -      const bool lower_base_p = (insns[base.from_insn] == i1);
> -      HOST_WIDE_INT adjust_amt = access_size;
> -      if (!lower_base_p)
> -	adjust_amt *= -1;
> +      if (!REG_P (base) || !poly_off.is_constant ())
> +	continue;
>  
> -      rtx change_reg = XEXP (change_pat, !load_p);
> -      machine_mode mode_for_mem = GET_MODE (change_mem);
> -      rtx effective_base = drop_writeback (base_mem);
> -      rtx new_mem = adjust_address_nv (effective_base,
> -				       mode_for_mem,
> -				       adjust_amt);
> -      rtx new_set = load_p
> -	? gen_rtx_SET (change_reg, new_mem)
> -	: gen_rtx_SET (new_mem, change_reg);
> +      // Punt on accesses relative to eliminable regs.  See the comment in
> +      // pair_fusion::track_access for a detailed explanation of this.
> +      if (!reload_completed
> +	  && (REGNO (base) == FRAME_POINTER_REGNUM
> +	      || REGNO (base) == ARG_POINTER_REGNUM))
> +	continue;
>  
> -      pats[changed_insn] = new_set;
> +      HOST_WIDE_INT base_off = poly_off.to_constant ();
>  
> -      auto keep_use = [&](use_info *u)
> +      // It should be unlikely that we ever punt here, since MEM_EXPR offset
> +      // alignment should be a good proxy for register offset alignment.
> +      if (base_off % access_size != 0)
>  	{
> -	  return refers_to_regno_p (u->regno (), u->regno () + 1,
> -				    change_pat, &XEXP (change_pat, load_p));
> -	};
> -
> -      // Drop any uses that only occur in the old address.
> -      input_uses[changed_insn] = filter_accesses (attempt,
> -						  input_uses[changed_insn],
> -						  keep_use);
> -    }
> -
> -  rtx writeback_effect = NULL_RTX;
> -  if (writeback)
> -    writeback_effect = extract_writebacks (load_p, pats, changed_insn);
> +	  if (dump_file)
> +	    fprintf (dump_file,
> +		     "base not viable, offset misaligned (insn %d)\n",
> +		     insns[i]->uid ());
> +	  continue;
> +	}
>  
> -  const auto base_regno = base.def->regno ();
> +      base_off /= access_size;
>  
> -  if (base.from_insn == -1 && (writeback & 1))
> -    {
> -      // If the first of the candidate insns had a writeback form, we'll need to
> -      // drop the use of the updated base register from the second insn's uses.
> -      //
> -      // N.B. we needn't worry about the base register occurring as a store
> -      // operand, as we checked that there was no non-address true dependence
> -      // between the insns in try_fuse_pair.
> -      gcc_checking_assert (find_access (input_uses[1], base_regno));
> -      input_uses[1] = check_remove_regno_access (attempt,
> -						 input_uses[1],
> -						 base_regno);
> -    }
> +      if (!is_lower)
> +	base_off--;
>  
> -  // Go through and drop uses that only occur in register notes,
> -  // as we won't be preserving those.
> -  for (int i = 0; i < 2; i++)
> -    {
> -      auto rti = insns[i]->rtl ();
> -      if (!REG_NOTES (rti))
> +      if (base_off < PAIR_MEM_MIN_IMM || base_off > PAIR_MEM_MAX_IMM)
>  	continue;
>  
> -      input_uses[i] = remove_note_accesses (attempt, input_uses[i]);
> +      use_info *use = find_access (insns[i]->uses (), REGNO (base));
> +      gcc_assert (use);
> +      base_defs[i] = use->def ();
>      }
>  
> -  // Edge case: if the first insn is a writeback load and the
> -  // second insn is a non-writeback load which transfers into the base
> -  // register, then we should drop the writeback altogether as the
> -  // update of the base register from the second load should prevail.
> -  //
> -  // For example:
> -  //   ldr x2, [x1], #8
> -  //   ldr x1, [x1]
> -  //   -->
> -  //   ldp x2, x1, [x1]
> -  if (writeback == 1
> -      && load_p
> -      && find_access (input_defs[1], base_regno))
> +  if (!base_defs[0] && !base_defs[1])
>      {
>        if (dump_file)
> -	fprintf (dump_file,
> -		 "  ldp: i%d has wb but subsequent i%d has non-wb "
> -		 "update of base (r%d), dropping wb\n",
> -		 insns[0]->uid (), insns[1]->uid (), base_regno);
> -      gcc_assert (writeback_effect);
> -      writeback_effect = NULL_RTX;
> +	fprintf (dump_file, "no viable base register for pair (%d,%d)\n",
> +		 insns[0]->uid (), insns[1]->uid ());
> +      return writeback;
>      }
>  
> -  // So far the patterns have been in instruction order,
> -  // now we want them in offset order.
> -  if (i1 != first)
> -    std::swap (pats[0], pats[1]);
> -
> -  poly_int64 offsets[2];
>    for (int i = 0; i < 2; i++)
> -    {
> -      rtx mem = XEXP (pats[i], load_p);
> -      gcc_checking_assert (MEM_P (mem));
> -      rtx base = strip_offset (XEXP (mem, 0), offsets + i);
> -      gcc_checking_assert (REG_P (base));
> -      gcc_checking_assert (base_regno == REGNO (base));
> +    if ((writeback & (1 << i)) && !base_defs[i])
> +      {
> +	if (dump_file)
> +	  fprintf (dump_file, "insn %d has writeback but base isn't viable\n",
> +		   insns[i]->uid ());
> +	return writeback;
> +      }
> +
> +  if (writeback == 3
> +      && base_defs[0]->regno () != base_defs[1]->regno ())
> +    {
> +      if (dump_file)
> +	fprintf (dump_file,
> +		 "pair (%d,%d): double writeback with distinct regs (%d,%d): "
> +		 "punting\n",
> +		 insns[0]->uid (), insns[1]->uid (),
> +		 base_defs[0]->regno (), base_defs[1]->regno ());
> +      return writeback;
>      }
>  
> -  // If either of the original insns had writeback, but the resulting pair insn
> -  // does not (can happen e.g. in the ldp edge case above, or if the writeback
> -  // effects cancel out), then drop the def(s) of the base register as
> -  // appropriate.
> +  if (base_defs[0] && base_defs[1]
> +      && base_defs[0]->regno () == base_defs[1]->regno ())
> +    {
> +      // Easy case: insns already share the same base reg.
> +      base_cands.quick_push (base_defs[0]);
> +      return writeback;
> +    }
> +
> +  // Otherwise, we know that one of the bases must change.
>    //
> -  // Also drop the first def in the case that both of the original insns had
> -  // writeback.  The second def could well have uses, but the first def should
> -  // only be used by the second insn (and we dropped that use above).
> +  // Note that if there is writeback we must use the writeback base
> +  // (we know now there is exactly one).
>    for (int i = 0; i < 2; i++)
> -    if ((!writeback_effect && (writeback & (1 << i)))
> -	|| (i == 0 && writeback == 3))
> -      input_defs[i] = check_remove_regno_access (attempt,
> -						 input_defs[i],
> -						 base_regno);
> +    if (base_defs[i] && (!writeback || (writeback & (1 << i))))
> +      base_cands.quick_push (base_cand { base_defs[i], i });
> +
> +  return writeback;
> +}
> +
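> +// Dump the uids of the insns in list L to F as a parenthesized,
> +// comma-separated list.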
> +void
> +dump_insn_list (FILE *f, const insn_list_t &l)
> +{
> +  fprintf (f, "(");
> +
> +  auto i = l.begin ();
> +  auto end = l.end ();
> +
> +  if (i != end)
> +    fprintf (f, "%d", (*i)->uid ());
> +  i++;
> +
> +  for (; i != end; i++)
> +    fprintf (f, ", %d", (*i)->uid ());
> +
> +  fprintf (f, ")");
> +}
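> +
> +// Allocate a splay tree node for ACCESS, drawing the memory from this
> +// pass's obstack.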
> +splay_tree_node<access_record *> *
> +pair_fusion::node_alloc (access_record *access)
> +{
> +  using T = splay_tree_node<access_record *>;
> +  void *addr = obstack_alloc (&m_obstack, sizeof (T));
> +  return new (addr) T (access);
> +}
> +
> +// Given a candidate access INSN (with mem MEM), see if it has a suitable
> +// MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
> +// LFS is used as part of the key to the hash table, see track_access.
> +bool
> +pair_fusion::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
> +{
> +  if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
> +    return false;
> +
> +  poly_int64 offset;
> +  tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
> +						  &offset);
> +  if (!base_expr || !DECL_P (base_expr))
> +    return false;
> +
> +  offset += MEM_OFFSET (mem);
> +
> +  const machine_mode mem_mode = GET_MODE (mem);
> +  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
> +
> +  // Punt on misaligned offsets.  Paired memory instructions require offsets to be a
> +  // multiple of the access size, and we believe that misaligned offsets on
> +  // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
> +  if (!multiple_p (offset, mem_size))
> +    return false;
> +
> +  const auto key = std::make_pair (base_expr, encode_lfs (lfs));
> +  access_group &group = expr_map.get_or_insert (key, NULL);
> +  auto alloc = [&](access_record *access) { return node_alloc (access); };
> +  group.track (alloc, offset, insn);
> +
> +  if (dump_file)
> +    {
> +      fprintf (dump_file, "[bb %u] tracking insn %d via ",
> +	       m_bb->index (), insn->uid ());
> +      print_node_brief (dump_file, "mem expr", base_expr, 0);
> +      fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
> +	       lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
> +      print_dec (offset, dump_file);
> +      fprintf (dump_file, "]\n");
> +    }
> +
> +  return true;
> +}
> +
> +// Main function to begin pair discovery.  Given a memory access INSN,
> +// determine whether it could be a candidate for fusing into a paired access,
> +// and if so, track it in the appropriate data structure for this basic
> +// block.  LOAD_P is true if the access is a load, and MEM is the mem
> +// rtx that occurs in INSN.
> +void
> +pair_fusion::track_access (insn_info *insn, bool load_p, rtx mem)
> +{
> +  // We can't combine volatile MEMs, so punt on these.
> +  if (MEM_VOLATILE_P (mem))
> +    return;
> +
> +  // Ignore writeback accesses if the param says to do so.
> +  if (pair_is_writeback ()
> +      && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
> +    return;
> +
> +  const machine_mode mem_mode = GET_MODE (mem);
> +
> +  if (!pair_operand_mode_ok_p (mem_mode))
> +    return;
> +
> +  rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);
> +
> +  if (pair_check_register_operand (load_p, reg_op, mem_mode))
> +    return;
> +
> +  // We want to segregate FP/SIMD accesses from GPR accesses.
> +  //
> +  // Before RA, we use the modes, noting that stores of constant zero
> +  // operands use GPRs (even in non-integer modes).  After RA, we use
> +  // the hard register numbers.
> +  const bool fpsimd_op_p = is_fpsimd_op_p (reg_op, mem_mode, load_p);
> +  // Note pair_operand_mode_ok_p already rejected VL modes.
> +  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
> +  const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };
> +
> +  if (track_via_mem_expr (insn, mem, lfs))
> +    return;
> +
> +  poly_int64 mem_off;
> +  rtx addr = XEXP (mem, 0);
> +  const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
> +  rtx base = pair_mem_strip_offset (mem, &mem_off);
> +  if (!REG_P (base))
> +    return;
> +
> +  // Need to calculate two (possibly different) offsets:
> +  //  - Offset at which the access occurs.
> +  //  - Offset of the new base def.
> +  poly_int64 access_off;
> +  if (autoinc_p && any_post_modify_p (addr))
> +    access_off = 0;
> +  else
> +    access_off = mem_off;
> +
> +  poly_int64 new_def_off = mem_off;
> +
> +  // Punt on accesses relative to eliminable regs.  Since we don't know the
> +  // elimination offset pre-RA, we should postpone forming pairs on such
> +  // accesses until after RA.
> +  //
> +  // As it stands, addresses with offsets in range for LDR but not
> +  // in range for a paired load/store are currently reloaded inefficiently,
> +  // ending up with a separate base register for each pair.
> +  //
> +  // In theory LRA should make use of
> +  // targetm.legitimize_address_displacement to promote sharing of
> +  // bases among multiple (nearby) address reloads, but the current
> +  // LRA code returns early from process_address_1 for operands that
> +  // satisfy "m", even if they don't satisfy the real (relaxed) address
> +  // constraint; this early return means we never get to the code
> +  // that calls targetm.legitimize_address_displacement.
> +  //
> +  // So for now, it's better to punt when we can't be sure that the
> +  // offset is in range for a paired load/store.  Out-of-range cases can then be
> +  // handled after RA by the out-of-range pair peepholes.  Eventually, it
> +  // would be nice to handle known out-of-range opportunities in the
> +  // pass itself (for stack accesses, this would be in the post-RA pass).
> +  if (!reload_completed
> +      && (REGNO (base) == FRAME_POINTER_REGNUM
> +	  || REGNO (base) == ARG_POINTER_REGNUM))
> +    return;
> +
> +  // Now need to find def of base register.
> +  use_info *base_use = find_access (insn->uses (), REGNO (base));
> +  gcc_assert (base_use);
> +  def_info *base_def = base_use->def ();
> +  if (!base_def)
> +    {
> +      if (dump_file)
> +	fprintf (dump_file,
> +		 "base register (regno %d) of insn %d is undefined",
> +		 REGNO (base), insn->uid ());
> +      return;
> +    }
> +
> +  alt_base *canon_base = canon_base_map.get (base_def);
> +  if (canon_base)
> +    {
> +      // Express this as the combined offset from the canonical base.
> +      base_def = canon_base->base;
> +      new_def_off += canon_base->offset;
> +      access_off += canon_base->offset;
> +    }
> +
> +  if (autoinc_p)
> +    {
> +      auto def = find_access (insn->defs (), REGNO (base));
> +      gcc_assert (def);
> +
> +      // Record that DEF = BASE_DEF + MEM_OFF.
> +      if (dump_file)
> +	{
> +	  pretty_printer pp;
> +	  pp_access (&pp, def, 0);
> +	  pp_string (&pp, " = ");
> +	  pp_access (&pp, base_def, 0);
> +	  fprintf (dump_file, "[bb %u] recording %s + ",
> +		   m_bb->index (), pp_formatted_text (&pp));
> +	  print_dec (new_def_off, dump_file);
> +	  fprintf (dump_file, "\n");
> +	}
> +
> +      alt_base base_rec { base_def, new_def_off };
> +      if (canon_base_map.put (def, base_rec))
> +	gcc_unreachable (); // Base defs should be unique.
> +    }
> +
> +  // Punt on misaligned offsets.  Paired accesses require offsets to be a multiple of
> +  // the access size.
> +  if (!multiple_p (mem_off, mem_size))
> +    return;
> +
> +  const auto key = std::make_pair (base_def, encode_lfs (lfs));
> +  access_group &group = def_map.get_or_insert (key, NULL);
> +  auto alloc = [&](access_record *access) { return node_alloc (access); };
> +  group.track (alloc, access_off, insn);
> +
> +  if (dump_file)
> +    {
> +      pretty_printer pp;
> +      pp_access (&pp, base_def, 0);
> +
> +      fprintf (dump_file, "[bb %u] tracking insn %d via %s",
> +	       m_bb->index (), insn->uid (), pp_formatted_text (&pp));
> +      fprintf (dump_file,
> +	       " [L=%d, WB=%d, FP=%d, %smode, off=",
> +	       lfs.load_p, autoinc_p, lfs.fpsimd_p, mode_name[mem_mode]);
> +      print_dec (access_off, dump_file);
> +      fprintf (dump_file, "]\n");
> +    }
> +}
> +
> +// We just emitted a tombstone with uid UID, track it in a bitmap for
> +// this BB so we can easily identify it later when cleaning up tombstones.
> +void
> +pair_fusion::track_tombstone (int uid)
> +{
> +  if (!m_emitted_tombstone)
> +    {
> +      // Lazily initialize the bitmap for tracking tombstone insns.
> +      bitmap_obstack_initialize (&m_bitmap_obstack);
> +      bitmap_initialize (&m_tombstone_bitmap, &m_bitmap_obstack);
> +      m_emitted_tombstone = true;
> +    }
> +
> +  if (!bitmap_set_bit (&m_tombstone_bitmap, uid))
> +    gcc_unreachable (); // Bit should have changed.
> +}
> +
> +// Given two adjacent memory accesses of the same size, I1 and I2, try
> +// and see if we can merge them into a paired load or store.
> +//
> +// ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
> +// if the accesses are both loads, otherwise they are both stores.
> +bool
> +pair_fusion::try_fuse_pair (bool load_p, unsigned access_size,
> +			    insn_info *i1, insn_info *i2)
> +{
> +  if (dump_file)
> +    fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
> +	     load_p, i1->uid (), i2->uid ());
> +
> +  insn_info *insns[2];
> +  bool reversed = false;
> +  if (*i1 < *i2)
> +    {
> +      insns[0] = i1;
> +      insns[1] = i2;
> +    }
> +  else
> +    {
> +      insns[0] = i2;
> +      insns[1] = i1;
> +      reversed = true;
> +    }
> +
> +  rtx cand_mems[2];
> +  rtx reg_ops[2];
> +  rtx pats[2];
> +  for (int i = 0; i < 2; i++)
> +    {
> +      pats[i] = PATTERN (insns[i]->rtl ());
> +      cand_mems[i] = XEXP (pats[i], load_p);
> +      reg_ops[i] = XEXP (pats[i], !load_p);
> +    }
> +
> +  if (!load_p && !fuseable_store_p (i1, i2))
> +    {
> +      if (dump_file)
> +	fprintf (dump_file,
> +		 "punting on store pair due to non-fuseable candidates (%d,%d)\n",
> +		 insns[0]->uid (), insns[1]->uid ());
> +      return false;
> +    }
> +
> +  if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
> +    {
> +      if (dump_file)
> +	fprintf (dump_file,
> +		 "punting on paired load due to register conflicts (%d,%d)\n",
> +		 insns[0]->uid (), insns[1]->uid ());
> +      return false;
> +    }
> +
> +  if (cfun->can_throw_non_call_exceptions
> +      && find_reg_note (insns[0]->rtl (), REG_EH_REGION, NULL_RTX)
> +      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
> +    {
> +      if (dump_file)
> +	fprintf (dump_file,
> +		 "can't combine insns with EH side effects (%d,%d)\n",
> +		 insns[0]->uid (), insns[1]->uid ());
> +      return false;
> +    }
> +
> +  auto_vec<base_cand, 2> base_cands (2);
> +
> +  int writeback = get_viable_bases (insns, base_cands, cand_mems,
> +				    access_size, reversed);
> +  if (base_cands.is_empty ())
> +    {
> +      if (dump_file)
> +	fprintf (dump_file, "no viable base for pair (%d,%d)\n",
> +		 insns[0]->uid (), insns[1]->uid ());
> +      return false;
> +    }
> +
> +  // Punt on frame-related insns with writeback.  We probably won't see
> +  // these in practice, but this is conservative and ensures we don't
> +  // have to worry about these later on.
> +  if (writeback && (RTX_FRAME_RELATED_P (i1->rtl ())
> +		    || RTX_FRAME_RELATED_P (i2->rtl ())))
> +    {
> +      if (dump_file)
> +	fprintf (dump_file,
> +		 "rejecting pair (%d,%d): frame-related insn with writeback\n",
> +		 i1->uid (), i2->uid ());
> +      return false;
> +    }
> +
> +  rtx *ignore = &XEXP (pats[1], load_p);
> +  for (auto use : insns[1]->uses ())
> +    if (!use->is_mem ()
> +	&& refers_to_regno_p (use->regno (), use->regno () + 1, pats[1], ignore)
> +	&& use->def () && use->def ()->insn () == insns[0])
> +      {
> +	// N.B. we allow a true dependence on the base address, as this
> +	// happens in the case of auto-inc accesses.  Consider a post-increment
> +	// load followed by a regular indexed load, for example.
> +	if (dump_file)
> +	  fprintf (dump_file,
> +		   "%d has non-address true dependence on %d, rejecting pair\n",
> +		   insns[1]->uid (), insns[0]->uid ());
> +	return false;
> +      }
>  
> -  // If we don't currently have a writeback pair, and we don't have
> -  // a load that clobbers the base register, look for a trailing destructive
> -  // update of the base register and try and fold it in to make this into a
> -  // writeback pair.
> -  insn_info *trailing_add = nullptr;
> -  if (aarch64_ldp_writeback > 1
> -      && !writeback_effect
> -      && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
> -					 XEXP (pats[0], 0), nullptr)
> -		      && !refers_to_regno_p (base_regno, base_regno + 1,
> -					     XEXP (pats[1], 0), nullptr))))
> +  unsigned i = 0;
> +  while (i < base_cands.length ())
>      {
> -      def_info *add_def;
> -      trailing_add = find_trailing_add (insns, move_range, writeback,
> -					&writeback_effect,
> -					&add_def, base.def, offsets[0],
> -					access_size);
> -      if (trailing_add)
> +      base_cand &cand = base_cands[i];
> +
> +      rtx *ignore[2] = {};
> +      for (int j = 0; j < 2; j++)
> +	if (cand.from_insn == !j)
> +	  ignore[j] = &XEXP (cand_mems[j], 0);
> +
> +      insn_info *h = first_hazard_after (insns[0], ignore[0]);
> +      if (h && *h < *insns[1])
> +	cand.hazards[0] = h;
> +
> +      h = latest_hazard_before (insns[1], ignore[1]);
> +      if (h && *h > *insns[0])
> +	cand.hazards[1] = h;
> +
> +      if (!cand.viable ())
>  	{
> -	  // The def of the base register from the trailing add should prevail.
> -	  input_defs[0] = insert_access (attempt, add_def, input_defs[0]);
> -	  gcc_assert (input_defs[0].is_valid ());
> +	  if (dump_file)
> +	    fprintf (dump_file,
> +		     "pair (%d,%d): rejecting base %d due to dataflow "
> +		     "hazards (%d,%d)\n",
> +		     insns[0]->uid (),
> +		     insns[1]->uid (),
> +		     cand.def->regno (),
> +		     cand.hazards[0]->uid (),
> +		     cand.hazards[1]->uid ());
> +
> +	  base_cands.ordered_remove (i);
>  	}
> +      else
> +	i++;
>      }
>  
> -  // Now that we know what base mem we're going to use, check if it's OK
> -  // with the ldp/stp policy.
> -  rtx first_mem = XEXP (pats[0], load_p);
> -  if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
> -						load_p,
> -						GET_MODE (first_mem)))
> +  if (base_cands.is_empty ())
>      {
>        if (dump_file)
> -	fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n",
> -		 i1->uid (), i2->uid ());
> +	fprintf (dump_file,
> +		 "can't form pair (%d,%d) due to dataflow hazards\n",
> +		 insns[0]->uid (), insns[1]->uid ());
>        return false;
>      }
>  
> -  rtx reg_notes = combine_reg_notes (first, second, load_p);
> +  insn_info *alias_hazards[4] = {};
>  
> -  rtx pair_pat;
> -  if (writeback_effect)
> +  // First def of memory after the first insn, and last def of memory
> +  // before the second insn, respectively.
> +  def_info *mem_defs[2] = {};
> +  if (load_p)
>      {
> -      auto patvec = gen_rtvec (3, writeback_effect, pats[0], pats[1]);
> -      pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
> +      if (!MEM_READONLY_P (cand_mems[0]))
> +	{
> +	  mem_defs[0] = memory_access (insns[0]->uses ())->def ();
> +	  gcc_checking_assert (mem_defs[0]);
> +	  mem_defs[0] = mem_defs[0]->next_def ();
> +	}
> +      if (!MEM_READONLY_P (cand_mems[1]))
> +	{
> +	  mem_defs[1] = memory_access (insns[1]->uses ())->def ();
> +	  gcc_checking_assert (mem_defs[1]);
> +	}
>      }
> -  else if (load_p)
> -    pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
> -				      XEXP (pats[1], 0),
> -				      XEXP (pats[0], 1));
>    else
> -    pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
> -				       XEXP (pats[0], 1),
> -				       XEXP (pats[1], 1));
> +    {
> +      mem_defs[0] = memory_access (insns[0]->defs ())->next_def ();
> +      mem_defs[1] = memory_access (insns[1]->defs ())->prev_def ();
> +      gcc_checking_assert (mem_defs[0]);
> +      gcc_checking_assert (mem_defs[1]);
> +    }
>  
> -  insn_change *pair_change = nullptr;
> -  auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
> -    rtx_insn *rti = change->insn ()->rtl ();
> -    validate_unshare_change (rti, &PATTERN (rti), pair_pat, true);
> -    validate_change (rti, &REG_NOTES (rti), reg_notes, true);
> +  auto tombstone_p = [&](insn_info *insn) -> bool {
> +    return m_emitted_tombstone
> +	   && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ());
>    };
>  
> -  if (load_p)
> -    {
> -      changes.safe_push (make_delete (first));
> -      pair_change = make_change (second);
> -      changes.safe_push (pair_change);
> +  store_walker<false, decltype(tombstone_p)>
> +    forward_store_walker (mem_defs[0], cand_mems[0], insns[1], tombstone_p);
>  
> -      pair_change->move_range = move_range;
> -      pair_change->new_defs = merge_access_arrays (attempt,
> -						   input_defs[0],
> -						   input_defs[1]);
> -      gcc_assert (pair_change->new_defs.is_valid ());
> +  store_walker<true, decltype(tombstone_p)>
> +    backward_store_walker (mem_defs[1], cand_mems[1], insns[0], tombstone_p);
>  
> -      pair_change->new_uses
> -	= merge_access_arrays (attempt,
> -			       drop_memory_access (input_uses[0]),
> -			       drop_memory_access (input_uses[1]));
> -      gcc_assert (pair_change->new_uses.is_valid ());
> -      set_pair_pat (pair_change);
> -    }
> +  alias_walker *walkers[4] = {};
> +  if (mem_defs[0])
> +    walkers[0] = &forward_store_walker;
> +  if (mem_defs[1])
> +    walkers[1] = &backward_store_walker;
> +
> +  if (load_p && (mem_defs[0] || mem_defs[1]))
> +    do_alias_analysis (alias_hazards, walkers, load_p);
>    else
>      {
> -      using Action = stp_change_builder::action;
> -      insn_info *store_to_change = try_repurpose_store (first, second,
> -							move_range);
> -      stp_change_builder builder (insns, store_to_change, pair_dst);
> -      insn_change *change;
> -      set_info *new_set = nullptr;
> -      for (; !builder.done (); builder.advance ())
> -	{
> -	  auto action = builder.get_change ();
> -	  change = (action.type == Action::INSERT)
> -	    ? nullptr : make_change (action.insn);
> -	  switch (action.type)
> -	    {
> -	    case Action::CHANGE:
> -	    {
> -	      set_pair_pat (change);
> -	      change->new_uses = merge_access_arrays (attempt,
> -						      input_uses[0],
> -						      input_uses[1]);
> -	      auto d1 = drop_memory_access (input_defs[0]);
> -	      auto d2 = drop_memory_access (input_defs[1]);
> -	      change->new_defs = merge_access_arrays (attempt, d1, d2);
> -	      gcc_assert (change->new_defs.is_valid ());
> -	      def_info *stp_def = memory_access (change->insn ()->defs ());
> -	      change->new_defs = insert_access (attempt,
> -						stp_def,
> -						change->new_defs);
> -	      gcc_assert (change->new_defs.is_valid ());
> -	      change->move_range = move_range;
> -	      pair_change = change;
> -	      break;
> -	    }
> -	    case Action::TOMBSTONE:
> -	    {
> -	      tombstone_uids.quick_push (change->insn ()->uid ());
> -	      rtx_insn *rti = change->insn ()->rtl ();
> -	      validate_change (rti, &PATTERN (rti), gen_tombstone (), true);
> -	      validate_change (rti, &REG_NOTES (rti), NULL_RTX, true);
> -	      change->new_uses = use_array (nullptr, 0);
> -	      break;
> -	    }
> -	    case Action::INSERT:
> -	    {
> -	      if (dump_file)
> -		fprintf (dump_file,
> -			 "  stp: cannot re-purpose candidate stores\n");
> -
> -	      auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat);
> -	      change = make_change (new_insn);
> -	      change->move_range = move_range;
> -	      change->new_uses = merge_access_arrays (attempt,
> -						      input_uses[0],
> -						      input_uses[1]);
> -	      gcc_assert (change->new_uses.is_valid ());
> +      // We want to find any loads hanging off the first store.
> +      mem_defs[0] = memory_access (insns[0]->defs ());
> +      load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]);
> +      load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]);
> +      walkers[2] = &forward_load_walker;
> +      walkers[3] = &backward_load_walker;
> +      do_alias_analysis (alias_hazards, walkers, load_p);
> +      // Now consolidate hazards back down.
> +      if (alias_hazards[2]
> +	  && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0])))
> +	alias_hazards[0] = alias_hazards[2];
>  
> -	      auto d1 = drop_memory_access (input_defs[0]);
> -	      auto d2 = drop_memory_access (input_defs[1]);
> -	      change->new_defs = merge_access_arrays (attempt, d1, d2);
> -	      gcc_assert (change->new_defs.is_valid ());
> +      if (alias_hazards[3]
> +	  && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1])))
> +	alias_hazards[1] = alias_hazards[3];
> +    }
>  
> -	      new_set = crtl->ssa->create_set (attempt, new_insn, memory);
> -	      change->new_defs = insert_access (attempt, new_set,
> -						change->new_defs);
> -	      gcc_assert (change->new_defs.is_valid ());
> -	      pair_change = change;
> -	      break;
> -	    }
> -	    case Action::FIXUP_USE:
> -	    {
> -	      // This use now needs to consume memory from our stp.
> -	      if (dump_file)
> -		fprintf (dump_file,
> -			 "  stp: changing i%d to use mem from new stp "
> -			 "(after i%d)\n",
> -			 action.insn->uid (), pair_dst->uid ());
> -	      change->new_uses = drop_memory_access (change->new_uses);
> -	      gcc_assert (new_set);
> -	      auto new_use = crtl->ssa->create_use (attempt, action.insn,
> -						    new_set);
> -	      change->new_uses = insert_access (attempt, new_use,
> -						change->new_uses);
> -	      break;
> -	    }
> -	    }
> -	  changes.safe_push (change);
> -	}
> +  if (alias_hazards[0] && alias_hazards[1]
> +      && *alias_hazards[0] <= *alias_hazards[1])
> +    {
> +      if (dump_file)
> +	fprintf (dump_file,
> +		 "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n",
> +		 i1->uid (), i2->uid (),
> +		 alias_hazards[0]->uid (), alias_hazards[1]->uid ());
> +      return false;
>      }
>  
> -  if (trailing_add)
> -    changes.safe_push (make_delete (trailing_add));
> -  else if ((writeback & 2) && !writeback_effect)
> +  // Now narrow the hazards on each base candidate using
> +  // the alias hazards.
> +  i = 0;
> +  while (i < base_cands.length ())
>      {
> -      // The second insn initially had writeback but now the pair does not,
> -      // need to update any nondebug uses of the base register def in the
> -      // second insn.  We'll take care of debug uses later.
> -      auto def = find_access (insns[1]->defs (), base_regno);
> -      gcc_assert (def);
> -      auto set = dyn_cast<set_info *> (def);
> -      if (set && set->has_nondebug_uses ())
> -	{
> -	  auto orig_use = find_access (insns[0]->uses (), base_regno);
> -	  for (auto use : set->nondebug_insn_uses ())
> -	    {
> -	      auto change = make_change (use->insn ());
> -	      change->new_uses = check_remove_regno_access (attempt,
> -							    change->new_uses,
> -							    base_regno);
> -	      change->new_uses = insert_access (attempt,
> -						orig_use,
> -						change->new_uses);
> -	      changes.safe_push (change);
> -	    }
> +      base_cand &cand = base_cands[i];
> +      if (alias_hazards[0] && (!cand.hazards[0]
> +			       || *alias_hazards[0] < *cand.hazards[0]))
> +	cand.hazards[0] = alias_hazards[0];
> +      if (alias_hazards[1] && (!cand.hazards[1]
> +			       || *alias_hazards[1] > *cand.hazards[1]))
> +	cand.hazards[1] = alias_hazards[1];
> +
> +      if (cand.viable ())
> +	i++;
> +      else
> +	{
> +	  if (dump_file)
> +	    fprintf (dump_file, "pair (%d,%d): rejecting base %d due to "
> +				"alias/dataflow hazards (%d,%d)",
> +				insns[0]->uid (), insns[1]->uid (),
> +				cand.def->regno (),
> +				cand.hazards[0]->uid (),
> +				cand.hazards[1]->uid ());
> +
> +	  base_cands.ordered_remove (i);
>  	}
>      }
>  
> -  auto is_changing = insn_is_changing (changes);
> -  for (unsigned i = 0; i < changes.length (); i++)
> -    gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing));
> -
> -  // Check the pair pattern is recog'd.
> -  if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing))
> +  if (base_cands.is_empty ())
>      {
>        if (dump_file)
> -	fprintf (dump_file, "  failed to form pair, recog failed\n");
> +	fprintf (dump_file,
> +		 "cannot form pair (%d,%d) due to alias/dataflow hazards",
> +		 insns[0]->uid (), insns[1]->uid ());
>  
> -      // Free any reg notes we allocated.
> -      while (reg_notes)
> -	{
> -	  rtx next = XEXP (reg_notes, 1);
> -	  free_EXPR_LIST_node (reg_notes);
> -	  reg_notes = next;
> -	}
> -      cancel_changes (0);
>        return false;
>      }
>  
> -  gcc_assert (crtl->ssa->verify_insn_changes (changes));
> -
> -  // Fix up any debug uses that will be affected by the changes.
> -  if (MAY_HAVE_DEBUG_INSNS)
> -    fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add,
> -		      load_p, writeback, writeback_effect, base_regno);
> -
> -  confirm_change_group ();
> -  crtl->ssa->change_insns (changes);
> -
> -  gcc_checking_assert (tombstone_uids.length () <= 2);
> -  for (auto uid : tombstone_uids)
> -    track_tombstone (uid);
> -
> -  return true;
> -}
> -
> -// Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
> -// within our BUDGET for alias analysis.
> -static bool
> -store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
> -{
> -  if (!budget)
> +  base_cand *base = &base_cands[0];
> +  if (base_cands.length () > 1)
>      {
> -      if (dump_file)
> +      // If there are still multiple viable bases, it makes sense
> +      // to choose one that allows us to reduce register pressure,
> +      // for loads this means moving further down, for stores this
> +      // means moving further up.
> +      gcc_checking_assert (base_cands.length () == 2);
> +      const int hazard_i = !load_p;
> +      if (base->hazards[hazard_i])
>  	{
> -	  fprintf (dump_file,
> -		   "exceeded budget, assuming store %d aliases with mem ",
> -		   store_insn->uid ());
> -	  print_simple_rtl (dump_file, mem);
> -	  fprintf (dump_file, "\n");
> +	  if (!base_cands[1].hazards[hazard_i])
> +	    base = &base_cands[1];
> +	  else if (load_p
> +		   && *base_cands[1].hazards[hazard_i]
> +		      > *(base->hazards[hazard_i]))
> +	    base = &base_cands[1];
> +	  else if (!load_p
> +		   && *base_cands[1].hazards[hazard_i]
> +		      < *(base->hazards[hazard_i]))
> +	    base = &base_cands[1];
>  	}
> -
> -      return true;
>      }
>  
> -  budget--;
> -  return memory_modified_in_insn_p (mem, store_insn->rtl ());
> -}
> -
> -// Return true if LOAD may be modified by STORE.  Make sure we keep
> -// within our BUDGET for alias analysis.
> -static bool
> -load_modified_by_store_p (insn_info *load,
> -			  insn_info *store,
> -			  int &budget)
> -{
> -  gcc_checking_assert (budget >= 0);
> +  // Otherwise, hazards[0] > hazards[1].
> +  // Pair can be formed anywhere in (hazards[1], hazards[0]).
> +  insn_range_info range (insns[0], insns[1]);
> +  if (base->hazards[1])
> +    range.first = base->hazards[1];
> +  if (base->hazards[0])
> +    range.last = base->hazards[0]->prev_nondebug_insn ();
>  
> -  if (!budget)
> +  // If the second insn can throw, narrow the move range to exactly that insn.
> +  // This prevents us trying to move the second insn from the end of the BB.
> +  if (cfun->can_throw_non_call_exceptions
> +      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
>      {
> -      if (dump_file)
> -	{
> -	  fprintf (dump_file,
> -		   "exceeded budget, assuming load %d aliases with store %d\n",
> -		   load->uid (), store->uid ());
> -	}
> -      return true;
> +      gcc_assert (range.includes (insns[1]));
> +      range = insn_range_info (insns[1]);
>      }
>  
> -  // It isn't safe to re-order stores over calls.
> -  if (CALL_P (load->rtl ()))
> -    return true;
> +  // Placement strategy: push loads down and pull stores up, this should
> +  // help register pressure by reducing live ranges.
> +  if (load_p)
> +    range.first = range.last;
> +  else
> +    range.last = range.first;
>  
> -  budget--;
> +  if (dump_file)
> +    {
> +      auto print_hazard = [](insn_info *i)
> +	{
> +	  if (i)
> +	    fprintf (dump_file, "%d", i->uid ());
> +	  else
> +	    fprintf (dump_file, "-");
> +	};
> +      auto print_pair = [print_hazard](insn_info **i)
> +	{
> +	  print_hazard (i[0]);
> +	  fprintf (dump_file, ",");
> +	  print_hazard (i[1]);
> +	};
>  
> -  // Iterate over all MEMs in the load, seeing if any alias with
> -  // our store.
> -  subrtx_var_iterator::array_type array;
> -  rtx pat = PATTERN (load->rtl ());
> -  FOR_EACH_SUBRTX_VAR (iter, array, pat, NONCONST)
> -    if (MEM_P (*iter) && memory_modified_in_insn_p (*iter, store->rtl ()))
> -      return true;
> +      fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
> +	      load_p, insns[0]->uid (), insns[1]->uid (),
> +	      base->def->regno ());
> +      print_pair (base->hazards);
> +      fprintf (dump_file, "), move_range: (%d,%d)\n",
> +	       range.first->uid (), range.last->uid ());
> +    }
>  
> -  return false;
> +  return fuse_pair (load_p, access_size, writeback,
> +		    i1, i2, *base, range);
>  }
>  
> -// Virtual base class for load/store walkers used in alias analysis.
> -struct alias_walker
> -{
> -  virtual bool conflict_p (int &budget) const = 0;
> -  virtual insn_info *insn () const = 0;
> -  virtual bool valid () const  = 0;
> -  virtual void advance () = 0;
> -};
> -
> -// Implement some common functionality used by both store_walker
> -// and load_walker.
> -template<bool reverse>
> -class def_walker : public alias_walker
> -{
> -protected:
> -  using def_iter_t = typename std::conditional<reverse,
> -	reverse_def_iterator, def_iterator>::type;
> -
> -  static use_info *start_use_chain (def_iter_t &def_iter)
> -  {
> -    set_info *set = nullptr;
> -    for (; *def_iter; def_iter++)
> -      {
> -	set = dyn_cast<set_info *> (*def_iter);
> -	if (!set)
> -	  continue;
> -
> -	use_info *use = reverse
> -	  ? set->last_nondebug_insn_use ()
> -	  : set->first_nondebug_insn_use ();
> -
> -	if (use)
> -	  return use;
> -      }
> -
> -    return nullptr;
> -  }
> -
> -  def_iter_t def_iter;
> -  insn_info *limit;
> -  def_walker (def_info *def, insn_info *limit) :
> -    def_iter (def), limit (limit) {}
> -
> -  virtual bool iter_valid () const { return *def_iter; }
> -
> -public:
> -  insn_info *insn () const override { return (*def_iter)->insn (); }
> -  void advance () override { def_iter++; }
> -  bool valid () const override final
> -  {
> -    if (!iter_valid ())
> -      return false;
> -
> -    if (reverse)
> -      return *(insn ()) > *limit;
> -    else
> -      return *(insn ()) < *limit;
> -  }
> -};
>  
> -// alias_walker that iterates over stores.
> -template<bool reverse, typename InsnPredicate>
> -class store_walker : public def_walker<reverse>
> +// LEFT_LIST and RIGHT_LIST are lists of candidate instructions where all insns
> +// in LEFT_LIST are known to be adjacent to those in RIGHT_LIST.
> +//
> +// This function traverses the resulting 2D matrix of possible pair candidates
> +// and attempts to merge them into pairs.
> +//
> +// The algorithm is straightforward: if we consider a combined list of
> +// candidates X obtained by merging LEFT_LIST and RIGHT_LIST in program order,
> +// then we advance through X until we reach a crossing point (where X[i] and
> +// X[i+1] come from different source lists).
> +//
> +// At this point we know X[i] and X[i+1] are adjacent accesses, and we try to
> +// fuse them into a pair.  If this succeeds, we remove X[i] and X[i+1] from
> +// their original lists and continue as above.
> +//
> +// In the failure case, we advance through the source list containing X[i] and
> +// continue as above (proceeding to the next crossing point).
> +//
> +// The rationale for skipping over groups of consecutive candidates from the
> +// same source list is as follows:
> +//
> +// In the store case, the insns in the group can't be re-ordered over each
> +// other as they are guaranteed to store to the same location, so we're
> +// guaranteed not to lose opportunities by doing this.
> +//
> +// In the load case, subsequent loads from the same location are either
> +// redundant (in which case they should have been cleaned up by an earlier
> +// optimization pass) or there is an intervening aliasing hazard, in which case
> +// we can't re-order them anyway, so provided earlier passes have cleaned up
> +// redundant loads, we shouldn't miss opportunities by doing this.
> +void
> +pair_fusion::merge_pairs (insn_list_t &left_list,
> +			  insn_list_t &right_list,
> +			  bool load_p,
> +			  unsigned access_size)
>  {
> -  rtx cand_mem;
> -  InsnPredicate tombstone_p;
> -
> -public:
> -  store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn,
> -		InsnPredicate tombstone_fn) :
> -    def_walker<reverse> (mem_def, limit_insn),
> -    cand_mem (mem), tombstone_p (tombstone_fn) {}
> -
> -  bool conflict_p (int &budget) const override final
> -  {
> -    if (tombstone_p (this->insn ()))
> -      return false;
> +  if (dump_file)
> +    {
> +      fprintf (dump_file, "merge_pairs [L=%d], cand vecs ", load_p);
> +      dump_insn_list (dump_file, left_list);
> +      fprintf (dump_file, " x ");
> +      dump_insn_list (dump_file, right_list);
> +      fprintf (dump_file, "\n");
> +    }
>  
> -    return store_modifies_mem_p (cand_mem, this->insn (), budget);
> -  }
> -};
> +  auto iter_l = left_list.begin ();
> +  auto iter_r = right_list.begin ();
>  
> -// alias_walker that iterates over loads.
> -template<bool reverse>
> -class load_walker : public def_walker<reverse>
> +  while (iter_l != left_list.end () && iter_r != right_list.end ())
> +    {
> +      auto next_l = std::next (iter_l);
> +      auto next_r = std::next (iter_r);
> +      if (**iter_l < **iter_r
> +	  && next_l != left_list.end ()
> +	  && **next_l < **iter_r)
> +	iter_l = next_l;
> +      else if (**iter_r < **iter_l
> +	       && next_r != right_list.end ()
> +	       && **next_r < **iter_l)
> +	iter_r = next_r;
> +      else if (try_fuse_pair (load_p, access_size, *iter_l, *iter_r))
> +	{
> +	  left_list.erase (iter_l);
> +	  iter_l = next_l;
> +	  right_list.erase (iter_r);
> +	  iter_r = next_r;
> +	}
> +      else if (**iter_l < **iter_r)
> +	iter_l = next_l;
> +      else
> +	iter_r = next_r;
> +    }
> +}
> +// If we emitted tombstone insns for this BB, iterate through the BB
> +// and remove all the tombstone insns, being sure to reparent any uses
> +// of mem to previous defs when we do this.
> +void
> +pair_fusion::cleanup_tombstones ()
>  {
> -  using Base = def_walker<reverse>;
> -  using use_iter_t = typename std::conditional<reverse,
> -	reverse_use_iterator, nondebug_insn_use_iterator>::type;
> +  // No need to do anything if we didn't emit a tombstone insn for this BB.
> +  if (!m_emitted_tombstone)
> +    return;
>  
> -  use_iter_t use_iter;
> -  insn_info *cand_store;
> +  insn_info *insn = m_bb->head_insn ();
> +  while (insn)
> +    {
> +      insn_info *next = insn->next_nondebug_insn ();
> +      if (!insn->is_real ()
> +	  || !bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()))
> +	{
> +	  insn = next;
> +	  continue;
> +	}
>  
> -  bool iter_valid () const override final { return *use_iter; }
> +      auto def = memory_access (insn->defs ());
> +      auto set = dyn_cast<set_info *> (def);
> +      if (set && set->has_any_uses ())
> +	{
> +	  def_info *prev_def = def->prev_def ();
> +	  auto prev_set = dyn_cast<set_info *> (prev_def);
> +	  if (!prev_set)
> +	    gcc_unreachable ();
>  
> -public:
> -  void advance () override final
> -  {
> -    use_iter++;
> -    if (*use_iter)
> -      return;
> -    this->def_iter++;
> -    use_iter = Base::start_use_chain (this->def_iter);
> -  }
> +	  while (set->first_use ())
> +	    crtl->ssa->reparent_use (set->first_use (), prev_set);
> +	}
>  
> -  insn_info *insn () const override final
> -  {
> -    return (*use_iter)->insn ();
> -  }
> +      // Now set has no uses, we can delete it.
> +      insn_change change (insn, insn_change::DELETE);
> +      crtl->ssa->change_insn (change);
> +      insn = next;
> +    }
> +}
>  
> -  bool conflict_p (int &budget) const override final
> -  {
> -    return load_modified_by_store_p (insn (), cand_store, budget);
> -  }
> +template<typename Map>
> +void
> +pair_fusion::traverse_base_map (Map &map)
> +{
> +  for (auto kv : map)
> +    {
> +      const auto &key = kv.first;
> +      auto &value = kv.second;
> +      transform_for_base (key.second, value);
> +    }
> +}
>  
> -  load_walker (def_info *def, insn_info *store, insn_info *limit_insn)
> -    : Base (def, limit_insn),
> -      use_iter (Base::start_use_chain (this->def_iter)),
> -      cand_store (store) {}
> -};
> +void
> +pair_fusion::transform ()
> +{
> +  traverse_base_map (expr_map);
> +  traverse_base_map (def_map);
> +}
>  
>  // Process our alias_walkers in a round-robin fashion, proceeding until
>  // nothing more can be learned from alias analysis.
>  //
>  // We try to maintain the invariant that if a walker becomes invalid, we
>  // set its pointer to null.
> -static void
> -do_alias_analysis (insn_info *alias_hazards[4],
> +void
> +pair_fusion::do_alias_analysis (insn_info *alias_hazards[4],
>  		   alias_walker *walkers[4],
>  		   bool load_p)
>  {
>    const int n_walkers = 2 + (2 * !load_p);
> -  int budget = aarch64_ldp_alias_check_limit;
> +  int budget = pair_mem_alias_check_limit();
>  
>    auto next_walker = [walkers,n_walkers](int current) -> int {
>      for (int j = 1; j <= n_walkers; j++)
> @@ -2341,548 +2553,554 @@ do_alias_analysis (insn_info *alias_hazards[4],
>      }
>  }
>  
> -// Given INSNS (in program order) which are known to be adjacent, look
> -// to see if either insn has a suitable RTL (register) base that we can
> -// use to form a pair.  Push these to BASE_CANDS if we find any.  CAND_MEMs
> -// gives the relevant mems from the candidate insns, ACCESS_SIZE gives the
> -// size of a single candidate access, and REVERSED says whether the accesses
> -// are inverted in offset order.
> +// Try and actually fuse the pair given by insns I1 and I2.
>  //
> -// Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
> -// addressing.
> -static int
> -get_viable_bases (insn_info *insns[2],
> -		  vec<base_cand> &base_cands,
> -		  rtx cand_mems[2],
> -		  unsigned access_size,
> -		  bool reversed)
> +// Here we've done enough analysis to know this is safe, we only
> +// reject the pair at this stage if either the tuning policy says to,
> +// or recog fails on the final pair insn.
> +//
> +// LOAD_P is true for loads, ACCESS_SIZE gives the access size of each
> +// candidate insn.  Bit i of WRITEBACK is set if the ith insn (in program
> +// order) uses writeback.
> +//
> +// BASE gives the chosen base candidate for the pair and MOVE_RANGE is
> +// a singleton range which says where to place the pair.
> +bool
> +pair_fusion::fuse_pair (bool load_p,
> +			unsigned access_size,
> +			int writeback,
> +			insn_info *i1, insn_info *i2,
> +			base_cand &base,
> +			const insn_range_info &move_range)
>  {
> -  // We discovered this pair through a common base.  Need to ensure that
> -  // we have a common base register that is live at both locations.
> -  def_info *base_defs[2] = {};
> -  int writeback = 0;
> -  for (int i = 0; i < 2; i++)
> -    {
> -      const bool is_lower = (i == reversed);
> -      poly_int64 poly_off;
> -      rtx base = ldp_strip_offset (cand_mems[i], &poly_off);
> -      if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
> -	writeback |= (1 << i);
> -
> -      if (!REG_P (base) || !poly_off.is_constant ())
> -	continue;
> -
> -      // Punt on accesses relative to eliminable regs.  See the comment in
> -      // ldp_bb_info::track_access for a detailed explanation of this.
> -      if (!reload_completed
> -	  && (REGNO (base) == FRAME_POINTER_REGNUM
> -	      || REGNO (base) == ARG_POINTER_REGNUM))
> -	continue;
> -
> -      HOST_WIDE_INT base_off = poly_off.to_constant ();
> -
> -      // It should be unlikely that we ever punt here, since MEM_EXPR offset
> -      // alignment should be a good proxy for register offset alignment.
> -      if (base_off % access_size != 0)
> -	{
> -	  if (dump_file)
> -	    fprintf (dump_file,
> -		     "base not viable, offset misaligned (insn %d)\n",
> -		     insns[i]->uid ());
> -	  continue;
> -	}
> -
> -      base_off /= access_size;
> -
> -      if (!is_lower)
> -	base_off--;
> -
> -      if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM)
> -	continue;
> -
> -      use_info *use = find_access (insns[i]->uses (), REGNO (base));
> -      gcc_assert (use);
> -      base_defs[i] = use->def ();
> -    }
> +  auto attempt = crtl->ssa->new_change_attempt ();
>  
> -  if (!base_defs[0] && !base_defs[1])
> +  auto make_change = [&attempt](insn_info *insn)
>      {
> -      if (dump_file)
> -	fprintf (dump_file, "no viable base register for pair (%d,%d)\n",
> -		 insns[0]->uid (), insns[1]->uid ());
> -      return writeback;
> -    }
> -
> -  for (int i = 0; i < 2; i++)
> -    if ((writeback & (1 << i)) && !base_defs[i])
> -      {
> -	if (dump_file)
> -	  fprintf (dump_file, "insn %d has writeback but base isn't viable\n",
> -		   insns[i]->uid ());
> -	return writeback;
> -      }
> -
> -  if (writeback == 3
> -      && base_defs[0]->regno () != base_defs[1]->regno ())
> +      return crtl->ssa->change_alloc<insn_change> (attempt, insn);
> +    };
> +  auto make_delete = [&attempt](insn_info *insn)
>      {
> -      if (dump_file)
> -	fprintf (dump_file,
> -		 "pair (%d,%d): double writeback with distinct regs (%d,%d): "
> -		 "punting\n",
> -		 insns[0]->uid (), insns[1]->uid (),
> -		 base_defs[0]->regno (), base_defs[1]->regno ());
> -      return writeback;
> -    }
> +      return crtl->ssa->change_alloc<insn_change> (attempt,
> +						   insn,
> +						   insn_change::DELETE);
> +    };
>  
> -  if (base_defs[0] && base_defs[1]
> -      && base_defs[0]->regno () == base_defs[1]->regno ())
> -    {
> -      // Easy case: insns already share the same base reg.
> -      base_cands.quick_push (base_defs[0]);
> -      return writeback;
> -    }
> +  if (*i1 > *i2)
> +    return false;
>  
> -  // Otherwise, we know that one of the bases must change.
> -  //
> -  // Note that if there is writeback we must use the writeback base
> -  // (we know now there is exactly one).
> -  for (int i = 0; i < 2; i++)
> -    if (base_defs[i] && (!writeback || (writeback & (1 << i))))
> -      base_cands.quick_push (base_cand { base_defs[i], i });
> +  insn_info *first = (*i1 < *i2) ? i1 : i2;
> +  insn_info *second = (first == i1) ? i2 : i1;
>  
> -  return writeback;
> -}
> +  insn_info *pair_dst = move_range.singleton ();
> +  gcc_assert (pair_dst);
> +
> +  insn_info *insns[2] = { first, second };
>  
> -// Given two adjacent memory accesses of the same size, I1 and I2, try
> -// and see if we can merge them into a ldp or stp.
> -//
> -// ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
> -// if the accesses are both loads, otherwise they are both stores.
> -bool
> -ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
> -			    insn_info *i1, insn_info *i2)
> -{
> -  if (dump_file)
> -    fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
> -	     load_p, i1->uid (), i2->uid ());
> +  auto_vec<insn_change *> changes;
> +  auto_vec<int, 2> tombstone_uids (2);
>  
> -  insn_info *insns[2];
> -  bool reversed = false;
> -  if (*i1 < *i2)
> -    {
> -      insns[0] = i1;
> -      insns[1] = i2;
> -    }
> -  else
> -    {
> -      insns[0] = i2;
> -      insns[1] = i1;
> -      reversed = true;
> -    }
> +  rtx pats[2] = {
> +    PATTERN (first->rtl ()),
> +    PATTERN (second->rtl ())
> +  };
>  
> -  rtx cand_mems[2];
> -  rtx reg_ops[2];
> -  rtx pats[2];
> +  // Make copies of the patterns as we might need to refer to the original RTL
> +  // later, for example when updating debug uses (which is after we've updated
> +  // one or both of the patterns in the candidate insns).
> +  rtx orig_rtl[2];
>    for (int i = 0; i < 2; i++)
> -    {
> -      pats[i] = PATTERN (insns[i]->rtl ());
> -      cand_mems[i] = XEXP (pats[i], load_p);
> -      reg_ops[i] = XEXP (pats[i], !load_p);
> -    }
> +    orig_rtl[i] = copy_rtx (pats[i]);
>  
> -  if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
> -    {
> -      if (dump_file)
> -	fprintf (dump_file,
> -		 "punting on ldp due to reg conflcits (%d,%d)\n",
> -		 insns[0]->uid (), insns[1]->uid ());
> -      return false;
> -    }
> +  use_array input_uses[2] = { first->uses (), second->uses () };
> +  def_array input_defs[2] = { first->defs (), second->defs () };
>  
> -  if (cfun->can_throw_non_call_exceptions
> -      && find_reg_note (insns[0]->rtl (), REG_EH_REGION, NULL_RTX)
> -      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
> +  int changed_insn = -1;
> +  if (base.from_insn != -1)
>      {
> -      if (dump_file)
> -	fprintf (dump_file,
> -		 "can't combine insns with EH side effects (%d,%d)\n",
> -		 insns[0]->uid (), insns[1]->uid ());
> -      return false;
> -    }
> +      // If we're not already using a shared base, we need
> +      // to re-write one of the accesses to use the base from
> +      // the other insn.
> +      gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1);
> +      changed_insn = !base.from_insn;
>  
> -  auto_vec<base_cand, 2> base_cands (2);
> +      rtx base_pat = pats[base.from_insn];
> +      rtx change_pat = pats[changed_insn];
> +      rtx base_mem = XEXP (base_pat, load_p);
> +      rtx change_mem = XEXP (change_pat, load_p);
>  
> -  int writeback = get_viable_bases (insns, base_cands, cand_mems,
> -				    access_size, reversed);
> -  if (base_cands.is_empty ())
> -    {
> -      if (dump_file)
> -	fprintf (dump_file, "no viable base for pair (%d,%d)\n",
> -		 insns[0]->uid (), insns[1]->uid ());
> -      return false;
> -    }
> +      const bool lower_base_p = (insns[base.from_insn] == i1);
> +      HOST_WIDE_INT adjust_amt = access_size;
> +      if (!lower_base_p)
> +	adjust_amt *= -1;
>  
> -  // Punt on frame-related insns with writeback.  We probably won't see
> -  // these in practice, but this is conservative and ensures we don't
> -  // have to worry about these later on.
> -  if (writeback && (RTX_FRAME_RELATED_P (i1->rtl ())
> -		    || RTX_FRAME_RELATED_P (i2->rtl ())))
> -    {
> -      if (dump_file)
> -	fprintf (dump_file,
> -		 "rejecting pair (%d,%d): frame-related insn with writeback\n",
> -		 i1->uid (), i2->uid ());
> -      return false;
> -    }
> +      rtx change_reg = XEXP (change_pat, !load_p);
> +      machine_mode mode_for_mem = GET_MODE (change_mem);
> +      rtx effective_base = drop_writeback (base_mem);
> +      rtx new_mem = adjust_address_nv (effective_base,
> +				       mode_for_mem,
> +				       adjust_amt);
> +      rtx new_set = load_p
> +	? gen_rtx_SET (change_reg, new_mem)
> +	: gen_rtx_SET (new_mem, change_reg);
>  
> -  rtx *ignore = &XEXP (pats[1], load_p);
> -  for (auto use : insns[1]->uses ())
> -    if (!use->is_mem ()
> -	&& refers_to_regno_p (use->regno (), use->regno () + 1, pats[1], ignore)
> -	&& use->def () && use->def ()->insn () == insns[0])
> -      {
> -	// N.B. we allow a true dependence on the base address, as this
> -	// happens in the case of auto-inc accesses.  Consider a post-increment
> -	// load followed by a regular indexed load, for example.
> -	if (dump_file)
> -	  fprintf (dump_file,
> -		   "%d has non-address true dependence on %d, rejecting pair\n",
> -		   insns[1]->uid (), insns[0]->uid ());
> -	return false;
> -      }
> +      pats[changed_insn] = new_set;
>  
> -  unsigned i = 0;
> -  while (i < base_cands.length ())
> -    {
> -      base_cand &cand = base_cands[i];
> +      auto keep_use = [&](use_info *u)
> +	{
> +	  return refers_to_regno_p (u->regno (), u->regno () + 1,
> +				    change_pat, &XEXP (change_pat, load_p));
> +	};
>  
> -      rtx *ignore[2] = {};
> -      for (int j = 0; j < 2; j++)
> -	if (cand.from_insn == !j)
> -	  ignore[j] = &XEXP (cand_mems[j], 0);
> +      // Drop any uses that only occur in the old address.
> +      input_uses[changed_insn] = filter_accesses (attempt,
> +						  input_uses[changed_insn],
> +						  keep_use);
> +    }
>  
> -      insn_info *h = first_hazard_after (insns[0], ignore[0]);
> -      if (h && *h < *insns[1])
> -	cand.hazards[0] = h;
> +  rtx writeback_effect = NULL_RTX;
> +  if (writeback)
> +    writeback_effect = extract_writebacks (load_p, pats, changed_insn);
>  
> -      h = latest_hazard_before (insns[1], ignore[1]);
> -      if (h && *h > *insns[0])
> -	cand.hazards[1] = h;
> +  const auto base_regno = base.def->regno ();
>  
> -      if (!cand.viable ())
> -	{
> -	  if (dump_file)
> -	    fprintf (dump_file,
> -		     "pair (%d,%d): rejecting base %d due to dataflow "
> -		     "hazards (%d,%d)\n",
> -		     insns[0]->uid (),
> -		     insns[1]->uid (),
> -		     cand.def->regno (),
> -		     cand.hazards[0]->uid (),
> -		     cand.hazards[1]->uid ());
> +  if (base.from_insn == -1 && (writeback & 1))
> +    {
> +      // If the first of the candidate insns had a writeback form, we'll need to
> +      // drop the use of the updated base register from the second insn's uses.
> +      //
> +      // N.B. we needn't worry about the base register occurring as a store
> +      // operand, as we checked that there was no non-address true dependence
> +      // between the insns in try_fuse_pair.
> +      gcc_checking_assert (find_access (input_uses[1], base_regno));
> +      input_uses[1] = check_remove_regno_access (attempt,
> +						 input_uses[1],
> +						 base_regno);
> +    }
>  
> -	  base_cands.ordered_remove (i);
> -	}
> -      else
> -	i++;
> +  // Go through and drop uses that only occur in register notes,
> +  // as we won't be preserving those.
> +  for (int i = 0; i < 2; i++)
> +    {
> +      auto rti = insns[i]->rtl ();
> +      if (!REG_NOTES (rti))
> +	continue;
> +
> +      input_uses[i] = remove_note_accesses (attempt, input_uses[i]);
>      }
>  
> -  if (base_cands.is_empty ())
> +  // Edge case: if the first insn is a writeback load and the
> +  // second insn is a non-writeback load which transfers into the base
> +  // register, then we should drop the writeback altogether as the
> +  // update of the base register from the second load should prevail.
> +  //
> +  // For example:
> +  //   ldr x2, [x1], #8
> +  //   ldr x1, [x1]
> +  //   -->
> +  //   ldp x2, x1, [x1]
> +  if (writeback == 1
> +      && load_p
> +      && find_access (input_defs[1], base_regno))
>      {
>        if (dump_file)
>  	fprintf (dump_file,
> -		 "can't form pair (%d,%d) due to dataflow hazards\n",
> -		 insns[0]->uid (), insns[1]->uid ());
> -      return false;
> +		 "  pair_mem: i%d has wb but subsequent i%d has non-wb "
> +		 "update of base (r%d), dropping wb\n",
> +		 insns[0]->uid (), insns[1]->uid (), base_regno);
> +      gcc_assert (writeback_effect);
> +      writeback_effect = NULL_RTX;
>      }
>  
> -  insn_info *alias_hazards[4] = {};
> +  // So far the patterns have been in instruction order,
> +  // now we want them in offset order.
> +  if (i1 != first)
> +    std::swap (pats[0], pats[1]);
>  
> -  // First def of memory after the first insn, and last def of memory
> -  // before the second insn, respectively.
> -  def_info *mem_defs[2] = {};
> -  if (load_p)
> +  poly_int64 offsets[2];
> +  for (int i = 0; i < 2; i++)
>      {
> -      if (!MEM_READONLY_P (cand_mems[0]))
> -	{
> -	  mem_defs[0] = memory_access (insns[0]->uses ())->def ();
> -	  gcc_checking_assert (mem_defs[0]);
> -	  mem_defs[0] = mem_defs[0]->next_def ();
> -	}
> -      if (!MEM_READONLY_P (cand_mems[1]))
> +      rtx mem = XEXP (pats[i], load_p);
> +      gcc_checking_assert (MEM_P (mem));
> +      rtx base = strip_offset (XEXP (mem, 0), offsets + i);
> +      gcc_checking_assert (REG_P (base));
> +      gcc_checking_assert (base_regno == REGNO (base));
> +    }
> +
> +  // If either of the original insns had writeback, but the resulting pair insn
> +  // does not (can happen e.g. in the pair mem  edge case above, or if the writeback
> +  // effects cancel out), then drop the def(s) of the base register as
> +  // appropriate.
> +  //
> +  // Also drop the first def in the case that both of the original insns had
> +  // writeback.  The second def could well have uses, but the first def should
> +  // only be used by the second insn (and we dropped that use above).
> +  for (int i = 0; i < 2; i++)
> +    if ((!writeback_effect && (writeback & (1 << i)))
> +	|| (i == 0 && writeback == 3))
> +      input_defs[i] = check_remove_regno_access (attempt,
> +						 input_defs[i],
> +						 base_regno);
> +
> +  // If we don't currently have a writeback pair, and we don't have
> +  // a load that clobbers the base register, look for a trailing destructive
> +  // update of the base register and try and fold it in to make this into a
> +  // writeback pair.
> +  insn_info *trailing_add = nullptr;
> +  if (pair_trailing_writeback_p ()
> +      && !writeback_effect
> +      && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
> +					 XEXP (pats[0], 0), nullptr)
> +		      && !refers_to_regno_p (base_regno, base_regno + 1,
> +					     XEXP (pats[1], 0), nullptr))))
> +    {
> +      def_info *add_def;
> +      trailing_add = find_trailing_add (insns, move_range, writeback,
> +					&writeback_effect,
> +					&add_def, base.def, offsets[0],
> +					access_size);
> +      if (trailing_add)
>  	{
> -	  mem_defs[1] = memory_access (insns[1]->uses ())->def ();
> -	  gcc_checking_assert (mem_defs[1]);
> +	  // The def of the base register from the trailing add should prevail.
> +	  input_defs[0] = insert_access (attempt, add_def, input_defs[0]);
> +	  gcc_assert (input_defs[0].is_valid ());
>  	}
>      }
> -  else
> +
> +  // Now that we know what base mem we're going to use, check if it's OK
> +  // with the pair mem  policy.
> +  rtx first_mem = XEXP (pats[0], load_p);
> +  if (pair_mem_ok_policy (first_mem,
> +			  load_p,
> +			  GET_MODE (first_mem)))
>      {
> -      mem_defs[0] = memory_access (insns[0]->defs ())->next_def ();
> -      mem_defs[1] = memory_access (insns[1]->defs ())->prev_def ();
> -      gcc_checking_assert (mem_defs[0]);
> -      gcc_checking_assert (mem_defs[1]);
> +      if (dump_file)
> +	fprintf (dump_file, "punting on pair (%d,%d), pair mem  policy says no\n",
> +		 i1->uid (), i2->uid ());
> +      return false;
>      }
>  
> -  auto tombstone_p = [&](insn_info *insn) -> bool {
> -    return m_emitted_tombstone
> -	   && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ());
> -  };
> +  rtx reg_notes = combine_reg_notes (first, second, load_p);
>  
> -  store_walker<false, decltype(tombstone_p)>
> -    forward_store_walker (mem_defs[0], cand_mems[0], insns[1], tombstone_p);
> +  rtx pair_pat;
>  
> -  store_walker<true, decltype(tombstone_p)>
> -    backward_store_walker (mem_defs[1], cand_mems[1], insns[0], tombstone_p);
> +  set_multiword_subreg (first, second, load_p);
>  
> -  alias_walker *walkers[4] = {};
> -  if (mem_defs[0])
> -    walkers[0] = &forward_store_walker;
> -  if (mem_defs[1])
> -    walkers[1] = &backward_store_walker;
> +  pair_pat = gen_load_store_pair (pats, writeback_effect, load_p);
> +  if (pair_pat == NULL_RTX)
> +    return false;
> +  insn_change *pair_change = nullptr;
> +  auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
> +    rtx_insn *rti = change->insn ()->rtl ();
> +    validate_unshare_change (rti, &PATTERN (rti), pair_pat, true);
> +    validate_change (rti, &REG_NOTES (rti), reg_notes, true);
> +  };
>  
> -  if (load_p && (mem_defs[0] || mem_defs[1]))
> -    do_alias_analysis (alias_hazards, walkers, load_p);
> -  else
> +  if (load_p)
>      {
> -      // We want to find any loads hanging off the first store.
> -      mem_defs[0] = memory_access (insns[0]->defs ());
> -      load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]);
> -      load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]);
> -      walkers[2] = &forward_load_walker;
> -      walkers[3] = &backward_load_walker;
> -      do_alias_analysis (alias_hazards, walkers, load_p);
> -      // Now consolidate hazards back down.
> -      if (alias_hazards[2]
> -	  && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0])))
> -	alias_hazards[0] = alias_hazards[2];
> +      changes.safe_push (make_delete (first));
> +      pair_change = make_change (second);
> +      changes.safe_push (pair_change);
>  
> -      if (alias_hazards[3]
> -	  && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1])))
> -	alias_hazards[1] = alias_hazards[3];
> -    }
> +      pair_change->move_range = move_range;
> +      pair_change->new_defs = merge_access_arrays (attempt,
> +						   input_defs[0],
> +						   input_defs[1]);
> +      gcc_assert (pair_change->new_defs.is_valid ());
>  
> -  if (alias_hazards[0] && alias_hazards[1]
> -      && *alias_hazards[0] <= *alias_hazards[1])
> -    {
> -      if (dump_file)
> -	fprintf (dump_file,
> -		 "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n",
> -		 i1->uid (), i2->uid (),
> -		 alias_hazards[0]->uid (), alias_hazards[1]->uid ());
> -      return false;
> +      pair_change->new_uses
> +	= merge_access_arrays (attempt,
> +			       drop_memory_access (input_uses[0]),
> +			       drop_memory_access (input_uses[1]));
> +      gcc_assert (pair_change->new_uses.is_valid ());
> +      set_pair_pat (pair_change);
>      }
> -
> -  // Now narrow the hazards on each base candidate using
> -  // the alias hazards.
> -  i = 0;
> -  while (i < base_cands.length ())
> +  else
>      {
> -      base_cand &cand = base_cands[i];
> -      if (alias_hazards[0] && (!cand.hazards[0]
> -			       || *alias_hazards[0] < *cand.hazards[0]))
> -	cand.hazards[0] = alias_hazards[0];
> -      if (alias_hazards[1] && (!cand.hazards[1]
> -			       || *alias_hazards[1] > *cand.hazards[1]))
> -	cand.hazards[1] = alias_hazards[1];
> -
> -      if (cand.viable ())
> -	i++;
> -      else
> +      using Action = stp_change_builder::action;
> +      insn_info *store_to_change = try_repurpose_store (first, second,
> +							move_range);
> +      stp_change_builder builder (insns, store_to_change, pair_dst);
> +      insn_change *change;
> +      set_info *new_set = nullptr;
> +      for (; !builder.done (); builder.advance ())
>  	{
> -	  if (dump_file)
> -	    fprintf (dump_file, "pair (%d,%d): rejecting base %d due to "
> -				"alias/dataflow hazards (%d,%d)",
> -				insns[0]->uid (), insns[1]->uid (),
> -				cand.def->regno (),
> -				cand.hazards[0]->uid (),
> -				cand.hazards[1]->uid ());
> -
> -	  base_cands.ordered_remove (i);
> -	}
> -    }
> +	  auto action = builder.get_change ();
> +	  change = (action.type == Action::INSERT)
> +	    ? nullptr : make_change (action.insn);
> +	  switch (action.type)
> +	    {
> +	    case Action::CHANGE:
> +	    {
> +	      set_pair_pat (change);
> +	      change->new_uses = merge_access_arrays (attempt,
> +						      input_uses[0],
> +						      input_uses[1]);
> +	      auto d1 = drop_memory_access (input_defs[0]);
> +	      auto d2 = drop_memory_access (input_defs[1]);
> +	      change->new_defs = merge_access_arrays (attempt, d1, d2);
> +	      gcc_assert (change->new_defs.is_valid ());
> +	      def_info *stp_def = memory_access (change->insn ()->defs ());
> +	      change->new_defs = insert_access (attempt,
> +						stp_def,
> +						change->new_defs);
> +	      gcc_assert (change->new_defs.is_valid ());
> +	      change->move_range = move_range;
> +	      pair_change = change;
> +	      break;
> +	    }
> +	    case Action::TOMBSTONE:
> +	    {
> +	      tombstone_uids.quick_push (change->insn ()->uid ());
> +	      rtx_insn *rti = change->insn ()->rtl ();
> +	      validate_change (rti, &PATTERN (rti), gen_tombstone (), true);
> +	      validate_change (rti, &REG_NOTES (rti), NULL_RTX, true);
> +	      change->new_uses = use_array (nullptr, 0);
> +	      break;
> +	    }
> +	    case Action::INSERT:
> +	    {
> +	      if (dump_file)
> +		fprintf (dump_file,
> +			 "  stp: cannot re-purpose candidate stores\n");
>  
> -  if (base_cands.is_empty ())
> -    {
> -      if (dump_file)
> -	fprintf (dump_file,
> -		 "cannot form pair (%d,%d) due to alias/dataflow hazards",
> -		 insns[0]->uid (), insns[1]->uid ());
> +	      auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat);
> +	      change = make_change (new_insn);
> +	      change->move_range = move_range;
> +	      change->new_uses = merge_access_arrays (attempt,
> +						      input_uses[0],
> +						      input_uses[1]);
> +	      gcc_assert (change->new_uses.is_valid ());
>  
> -      return false;
> -    }
> +	      auto d1 = drop_memory_access (input_defs[0]);
> +	      auto d2 = drop_memory_access (input_defs[1]);
> +	      change->new_defs = merge_access_arrays (attempt, d1, d2);
> +	      gcc_assert (change->new_defs.is_valid ());
>  
> -  base_cand *base = &base_cands[0];
> -  if (base_cands.length () > 1)
> -    {
> -      // If there are still multiple viable bases, it makes sense
> -      // to choose one that allows us to reduce register pressure,
> -      // for loads this means moving further down, for stores this
> -      // means moving further up.
> -      gcc_checking_assert (base_cands.length () == 2);
> -      const int hazard_i = !load_p;
> -      if (base->hazards[hazard_i])
> -	{
> -	  if (!base_cands[1].hazards[hazard_i])
> -	    base = &base_cands[1];
> -	  else if (load_p
> -		   && *base_cands[1].hazards[hazard_i]
> -		      > *(base->hazards[hazard_i]))
> -	    base = &base_cands[1];
> -	  else if (!load_p
> -		   && *base_cands[1].hazards[hazard_i]
> -		      < *(base->hazards[hazard_i]))
> -	    base = &base_cands[1];
> +	      new_set = crtl->ssa->create_set (attempt, new_insn, memory);
> +	      change->new_defs = insert_access (attempt, new_set,
> +						change->new_defs);
> +	      gcc_assert (change->new_defs.is_valid ());
> +	      pair_change = change;
> +	      break;
> +	    }
> +	    case Action::FIXUP_USE:
> +	    {
> +	      // This use now needs to consume memory from our stp.
> +	      if (dump_file)
> +		fprintf (dump_file,
> +			 "  stp: changing i%d to use mem from new stp "
> +			 "(after i%d)\n",
> +			 action.insn->uid (), pair_dst->uid ());
> +	      change->new_uses = drop_memory_access (change->new_uses);
> +	      gcc_assert (new_set);
> +	      auto new_use = crtl->ssa->create_use (attempt, action.insn,
> +						    new_set);
> +	      change->new_uses = insert_access (attempt, new_use,
> +						change->new_uses);
> +	      break;
> +	    }
> +	    }
> +	  changes.safe_push (change);
>  	}
>      }
>  
> -  // Otherwise, hazards[0] > hazards[1].
> -  // Pair can be formed anywhere in (hazards[1], hazards[0]).
> -  insn_range_info range (insns[0], insns[1]);
> -  if (base->hazards[1])
> -    range.first = base->hazards[1];
> -  if (base->hazards[0])
> -    range.last = base->hazards[0]->prev_nondebug_insn ();
> -
> -  // If the second insn can throw, narrow the move range to exactly that insn.
> -  // This prevents us trying to move the second insn from the end of the BB.
> -  if (cfun->can_throw_non_call_exceptions
> -      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
> +  if (trailing_add)
> +    changes.safe_push (make_delete (trailing_add));
> +  else if ((writeback & 2) && !writeback_effect)
>      {
> -      gcc_assert (range.includes (insns[1]));
> -      range = insn_range_info (insns[1]);
> +      // The second insn initially had writeback but now the pair does not,
> +      // need to update any nondebug uses of the base register def in the
> +      // second insn.  We'll take care of debug uses later.
> +      auto def = find_access (insns[1]->defs (), base_regno);
> +      gcc_assert (def);
> +      auto set = dyn_cast<set_info *> (def);
> +      if (set && set->has_nondebug_uses ())
> +	{
> +	  auto orig_use = find_access (insns[0]->uses (), base_regno);
> +	  for (auto use : set->nondebug_insn_uses ())
> +	    {
> +	      auto change = make_change (use->insn ());
> +	      change->new_uses = check_remove_regno_access (attempt,
> +							    change->new_uses,
> +							    base_regno);
> +	      change->new_uses = insert_access (attempt,
> +						orig_use,
> +						change->new_uses);
> +	      changes.safe_push (change);
> +	    }
> +	}
>      }
>  
> -  // Placement strategy: push loads down and pull stores up, this should
> -  // help register pressure by reducing live ranges.
> -  if (load_p)
> -    range.first = range.last;
> -  else
> -    range.last = range.first;
> +  auto is_changing = insn_is_changing (changes);
> +  for (unsigned i = 0; i < changes.length (); i++)
> +    gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing));
>  
> -  if (dump_file)
> +  // Check the pair pattern is recog'd.
> +  if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing))
>      {
> -      auto print_hazard = [](insn_info *i)
> -	{
> -	  if (i)
> -	    fprintf (dump_file, "%d", i->uid ());
> -	  else
> -	    fprintf (dump_file, "-");
> -	};
> -      auto print_pair = [print_hazard](insn_info **i)
> -	{
> -	  print_hazard (i[0]);
> -	  fprintf (dump_file, ",");
> -	  print_hazard (i[1]);
> -	};
> +      if (dump_file)
> +	fprintf (dump_file, "  failed to form pair, recog failed\n");
>  
> -      fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
> -	      load_p, insns[0]->uid (), insns[1]->uid (),
> -	      base->def->regno ());
> -      print_pair (base->hazards);
> -      fprintf (dump_file, "), move_range: (%d,%d)\n",
> -	       range.first->uid (), range.last->uid ());
> +      // Free any reg notes we allocated.
> +      while (reg_notes)
> +	{
> +	  rtx next = XEXP (reg_notes, 1);
> +	  free_EXPR_LIST_node (reg_notes);
> +	  reg_notes = next;
> +	}
> +      cancel_changes (0);
> +      return false;
>      }
>  
> -  return fuse_pair (load_p, access_size, writeback,
> -		    i1, i2, *base, range);
> +  gcc_assert (crtl->ssa->verify_insn_changes (changes));
> +
> +  // Fix up any debug uses that will be affected by the changes.
> +  if (MAY_HAVE_DEBUG_INSNS)
> +    fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add,
> +		      load_p, writeback, writeback_effect, base_regno);
> +
> +  confirm_change_group ();
> +  crtl->ssa->change_insns (changes);
> +
> +  gcc_checking_assert (tombstone_uids.length () <= 2);
> +  for (auto uid : tombstone_uids)
> +    track_tombstone (uid);
> +
> +  return true;
>  }
>  
> -static void
> -dump_insn_list (FILE *f, const insn_list_t &l)
> +struct  aarch64_pair_fusion : public pair_fusion
>  {
> -  fprintf (f, "(");
> +public:
> +  aarch64_pair_fusion (bb_info *bb) : pair_fusion (bb) {};
> +  bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode, bool load_p)
> +  {
> +    const bool fpsimd_op_p
> +      = reload_completed
> +      ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
> +      : (GET_MODE_CLASS (mem_mode) != MODE_INT
> +	 && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
> +    return fpsimd_op_p;
> +  }
>  
> -  auto i = l.begin ();
> -  auto end = l.end ();
> +  bool pair_mem_ok_policy (rtx first_mem, bool load_p, machine_mode mode)
> +  {
> +    return !aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
> +						     load_p,
> +						     mode);
> +  }
> +  bool pair_operand_mode_ok_p (machine_mode mode);
>  
> -  if (i != end)
> -    fprintf (f, "%d", (*i)->uid ());
> -  i++;
> +  void transform_for_base (int encoded_lfs,
> +			   access_group &group);
> +  rtx gen_load_store_pair (rtx *pats,
> +			   rtx writeback,
> +			   bool load_p)
> +  {
> +    rtx pair_pat;
>  
> -  for (; i != end; i++)
> -    fprintf (f, ", %d", (*i)->uid ());
> +    if (writeback)
> +      {
> +	auto patvec = gen_rtvec (3, writeback, pats[0], pats[1]);
> +	pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
> +      }
> +    else if (load_p)
> +      pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
> +					XEXP (pats[1], 0),
> +					XEXP (pats[0], 1));
> +    else
> +      pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
> +					 XEXP (pats[0], 1),
> +					 XEXP (pats[1], 1));
> +     return pair_pat;
> +  }
>  
> -  fprintf (f, ")");
> -}
> +  void set_multiword_subreg (insn_info *i1, insn_info *i2, bool load_p)
> +  {
> +    if (i1 || i2 || load_p)
> +      return;
> +    return;
> +  }
> +  bool pair_trailing_writeback_p  ()
> +  {
> +    return aarch64_ldp_writeback > 1;
> +  }
> +  bool pair_check_register_operand (bool load_p, rtx reg_op, machine_mode mem_mode)
> +  {
> +    return  (load_p
> +	     ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
> +	     : !aarch64_stp_reg_operand (reg_op, mem_mode));
> +  }
> +  int pair_mem_alias_check_limit ()
> +  {
> +    return aarch64_ldp_alias_check_limit;
> +  }
> +  bool fuseable_store_p (insn_info *i1, insn_info *i2) { return i1 || i2;}
> +  bool fuseable_load_p (insn_info *insn) { return insn;}
> +  bool pair_is_writeback ()
> +  {
> +    return !aarch64_ldp_writeback;
> +  }
> +private:
> +   int num_pairs;
> +   rtx_insn *reg_ops[3];
> +};
>  
> -DEBUG_FUNCTION void
> -debug (const insn_list_t &l)
> +static lfs_fields
> +decode_lfs (int lfs)
>  {
> -  dump_insn_list (stderr, l);
> -  fprintf (stderr, "\n");
> +  bool load_p = (lfs & (1 << 3));
> +  bool fpsimd_p = (lfs & (1 << 2));
> +  unsigned size = 1U << ((lfs & 3) + 2);
> +  return { load_p, fpsimd_p, size };
>  }
>  
> -// LEFT_LIST and RIGHT_LIST are lists of candidate instructions where all insns
> -// in LEFT_LIST are known to be adjacent to those in RIGHT_LIST.
> -//
> -// This function traverses the resulting 2D matrix of possible pair candidates
> -// and attempts to merge them into pairs.
> -//
> -// The algorithm is straightforward: if we consider a combined list of
> -// candidates X obtained by merging LEFT_LIST and RIGHT_LIST in program order,
> -// then we advance through X until we reach a crossing point (where X[i] and
> -// X[i+1] come from different source lists).
> -//
> -// At this point we know X[i] and X[i+1] are adjacent accesses, and we try to
> -// fuse them into a pair.  If this succeeds, we remove X[i] and X[i+1] from
> -// their original lists and continue as above.
> -//
> -// In the failure case, we advance through the source list containing X[i] and
> -// continue as above (proceeding to the next crossing point).
> -//
> -// The rationale for skipping over groups of consecutive candidates from the
> -// same source list is as follows:
> -//
> -// In the store case, the insns in the group can't be re-ordered over each
> -// other as they are guaranteed to store to the same location, so we're
> -// guaranteed not to lose opportunities by doing this.
> -//
> -// In the load case, subsequent loads from the same location are either
> -// redundant (in which case they should have been cleaned up by an earlier
> -// optimization pass) or there is an intervening aliasing hazard, in which case
> -// we can't re-order them anyway, so provided earlier passes have cleaned up
> -// redundant loads, we shouldn't miss opportunities by doing this.
> -void
> -ldp_bb_info::merge_pairs (insn_list_t &left_list,
> -			  insn_list_t &right_list,
> -			  bool load_p,
> -			  unsigned access_size)
> +// Return true if we should consider forming ldp/stp insns from memory
> +// accesses with operand mode MODE at this stage in compilation.
> +static bool
> +ldp_operand_mode_ok_p (machine_mode mode)
>  {
> -  if (dump_file)
> -    {
> -      fprintf (dump_file, "merge_pairs [L=%d], cand vecs ", load_p);
> -      dump_insn_list (dump_file, left_list);
> -      fprintf (dump_file, " x ");
> -      dump_insn_list (dump_file, right_list);
> -      fprintf (dump_file, "\n");
> -    }
> +  const bool allow_qregs
> +    = !(aarch64_tune_params.extra_tuning_flags
> +	& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
>  
> -  auto iter_l = left_list.begin ();
> -  auto iter_r = right_list.begin ();
> +  if (!aarch64_ldpstp_operand_mode_p (mode))
> +    return false;
>  
> -  while (iter_l != left_list.end () && iter_r != right_list.end ())
> +  const auto size = GET_MODE_SIZE (mode).to_constant ();
> +  if (size == 16 && !allow_qregs)
> +    return false;
> +
> +  // We don't pair up TImode accesses before RA because TImode is
> +  // special in that it can be allocated to a pair of GPRs or a single
> +  // FPR, and the RA is best placed to make that decision.
> +  return reload_completed || mode != TImode;
> +}
> +
> +bool
> +aarch64_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
> +{
> +  return (ldp_operand_mode_ok_p (mode));
> +}
> +
> +// Given a pair mode MODE, return a canonical mode to be used for a single
> +// operand of such a pair.  Currently we only use this when promoting a
> +// non-writeback pair into a writeback pair, as it isn't otherwise clear
> +// which mode to use when storing a modeless CONST_INT.
> +static machine_mode
> +aarch64_operand_mode_for_pair_mode (machine_mode mode)
> +{
> +  switch (mode)
>      {
> -      auto next_l = std::next (iter_l);
> -      auto next_r = std::next (iter_r);
> -      if (**iter_l < **iter_r
> -	  && next_l != left_list.end ()
> -	  && **next_l < **iter_r)
> -	iter_l = next_l;
> -      else if (**iter_r < **iter_l
> -	       && next_r != right_list.end ()
> -	       && **next_r < **iter_l)
> -	iter_r = next_r;
> -      else if (try_fuse_pair (load_p, access_size, *iter_l, *iter_r))
> -	{
> -	  left_list.erase (iter_l);
> -	  iter_l = next_l;
> -	  right_list.erase (iter_r);
> -	  iter_r = next_r;
> -	}
> -      else if (**iter_l < **iter_r)
> -	iter_l = next_l;
> -      else
> -	iter_r = next_r;
> +    case E_V2x4QImode:
> +      return SImode;
> +    case E_V2x8QImode:
> +      return DImode;
> +    case E_V2x16QImode:
> +      return V16QImode;
> +    default:
> +      gcc_unreachable ();
>      }
>  }
>  
> @@ -2890,8 +3108,8 @@ ldp_bb_info::merge_pairs (insn_list_t &left_list,
>  // of accesses.  If we find two sets of adjacent accesses, call
>  // merge_pairs.
>  void
> -ldp_bb_info::transform_for_base (int encoded_lfs,
> -				 access_group &group)
> +aarch64_pair_fusion::transform_for_base (int encoded_lfs,
> +					 access_group &group)
>  {
>    const auto lfs = decode_lfs (encoded_lfs);
>    const unsigned access_size = lfs.size;
> @@ -2915,55 +3133,6 @@ ldp_bb_info::transform_for_base (int encoded_lfs,
>      }
>  }
>  
> -// If we emitted tombstone insns for this BB, iterate through the BB
> -// and remove all the tombstone insns, being sure to reparent any uses
> -// of mem to previous defs when we do this.
> -void
> -ldp_bb_info::cleanup_tombstones ()
> -{
> -  // No need to do anything if we didn't emit a tombstone insn for this BB.
> -  if (!m_emitted_tombstone)
> -    return;
> -
> -  for (auto insn : iterate_safely (m_bb->nondebug_insns ()))
> -    {
> -      if (!insn->is_real ()
> -	  || !bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()))
> -	continue;
> -
> -      auto set = as_a<set_info *> (memory_access (insn->defs ()));
> -      if (set->has_any_uses ())
> -	{
> -	  auto prev_set = as_a<set_info *> (set->prev_def ());
> -	  while (set->first_use ())
> -	    crtl->ssa->reparent_use (set->first_use (), prev_set);
> -	}
> -
> -      // Now set has no uses, we can delete it.
> -      insn_change change (insn, insn_change::DELETE);
> -      crtl->ssa->change_insn (change);
> -    }
> -}
> -
> -template<typename Map>
> -void
> -ldp_bb_info::traverse_base_map (Map &map)
> -{
> -  for (auto kv : map)
> -    {
> -      const auto &key = kv.first;
> -      auto &value = kv.second;
> -      transform_for_base (key.second, value);
> -    }
> -}
> -
> -void
> -ldp_bb_info::transform ()
> -{
> -  traverse_base_map (expr_map);
> -  traverse_base_map (def_map);
> -}
> -
>  static void
>  ldp_fusion_init ()
>  {
> @@ -3174,7 +3343,9 @@ void ldp_fusion_bb (bb_info *bb)
>    const bool track_stores
>      = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
>  
> -  ldp_bb_info bb_state (bb);
> +  pair_fusion *bb_state;
> +  aarch64_pair_fusion derived (bb);
> +  bb_state = &derived;
>  
>    for (auto insn : bb->nondebug_insns ())
>      {
> @@ -3194,13 +3365,13 @@ void ldp_fusion_bb (bb_info *bb)
>  	continue;
>  
>        if (track_stores && MEM_P (XEXP (pat, 0)))
> -	bb_state.track_access (insn, false, XEXP (pat, 0));
> +	bb_state->track_access (insn, false, XEXP (pat, 0));
>        else if (track_loads && MEM_P (XEXP (pat, 1)))
> -	bb_state.track_access (insn, true, XEXP (pat, 1));
> +	bb_state->track_access (insn, true, XEXP (pat, 1));
>      }
>  
> -  bb_state.transform ();
> -  bb_state.cleanup_tombstones ();
> +  bb_state->transform ();
> +  bb_state->cleanup_tombstones ();
>  }
>  
>  void ldp_fusion ()
> @@ -3263,7 +3434,7 @@ public:
>      }
>  };
>  
> -} // anon namespace
> +}// anon namespace
>  
>  rtl_opt_pass *
>  make_pass_ldp_fusion (gcc::context *ctx)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 0/2 V2] aarch64: Place target independent and dependent code in one file.
  2024-02-22 19:49 ` Richard Sandiford
@ 2024-02-22 21:17   ` Segher Boessenkool
  2024-02-23 11:25   ` Ajit Agarwal
  1 sibling, 0 replies; 4+ messages in thread
From: Segher Boessenkool @ 2024-02-22 21:17 UTC (permalink / raw)
  To: Ajit Agarwal, Alex Coplan, Kewen.Lin, Michael Meissner,
	Peter Bergner, David Edelsohn, gcc-patches, richard.sandiford

On Thu, Feb 22, 2024 at 07:49:20PM +0000, Richard Sandiford wrote:
> Thanks for the update.  This is still quite hard to review though.
> Sorry to ask for another round, but could you split it up further?
> The ideal thing would be if patches that move code do nothing other
> than move code, and if patches that change code do those changes
> in-place.

In general, if there is a (big) part of the patch that does not change
behaviour at all, it should be a separate patch.  Such a patch is then
easy to review (do say in the commit message that it does not change
behaviour, though; that helps reviewers).

It also makes the remaining tiny patches much easier to review then.

Very generally, any patch that makes interesting changes should not
have more than a few lines of semantic content.  That can be repeated, of
course, and can have mechanical follow-up changes as fall-out, but that's the
essence of good patchsets: one change per patch.

And then the commit message can be simple as well, and the changelog
will be easy to write.  That is the litmus test for a good patch series :-)


Segher

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 0/2 V2] aarch64: Place target independent and dependent code in one file.
  2024-02-22 19:49 ` Richard Sandiford
  2024-02-22 21:17   ` Segher Boessenkool
@ 2024-02-23 11:25   ` Ajit Agarwal
  1 sibling, 0 replies; 4+ messages in thread
From: Ajit Agarwal @ 2024-02-23 11:25 UTC (permalink / raw)
  To: Alex Coplan, Kewen.Lin, Segher Boessenkool, Michael Meissner,
	Peter Bergner, David Edelsohn, gcc-patches, richard.sandiford

Hello Richard:

On 23/02/24 1:19 am, Richard Sandiford wrote:
> Ajit Agarwal <aagarwa1@linux.ibm.com> writes:
>> Hello Alex/Richard:
>>
>> I have placed target indpendent and target dependent code in
>> aarch64-ldp-fusion for load store fusion.
>>
>> Common infrastructure of load store pair fusion is divided into
>> target independent and target dependent code.
>>
>> Target independent code is the Generic code with pure virtual
>> function to interface betwwen target independent and dependent
>> code.
>>
>> Target dependent code is the implementation of pure virtual
>> function for aarch64 target and the call to target independent
>> code.
> 
> Thanks for the update.  This is still quite hard to review though.
> Sorry to ask for another round, but could you split it up further?
> The ideal thing would be if patches that move code do nothing other
> than move code, and if patches that change code do those changes
> in-place.
> 

As per your suggestion I have submitted a new patch with the above changes.
Sorry for the inconvenience caused.

Thanks & Regards
Ajit


> Richard
> 
>>
>> Bootstrapped in aarch64-linux-gnu.
>>
>> Thanks & Regards
>> Ajit
>>
>>
>> aarch64: Place target independent and dependent code in one file.
>>
>> Common infrastructure of load store pair fusion is divided into
>> target independent and target dependent code.
>>
>> Target independent code is the Generic code with pure virtual
>> function to interface betwwen target independent and dependent
>> code.
>>
>> Target dependent code is the implementation of pure virtual
>> function for aarch64 target and the call to target independent
>> code.
>>
>> 2024-02-15  Ajit Kumar Agarwal  <aagarwa1@linux.ibm.com>
>>
>> gcc/ChangeLog:
>>
>> 	* config/aarch64/aarch64-ldp-fusion.cc: Place target
>> 	independent and dependent code.
>> ---
>>  gcc/config/aarch64/aarch64-ldp-fusion.cc | 3513 ++++++++++++----------
>>  1 file changed, 1842 insertions(+), 1671 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> index 22ed95eb743..0ab842e2bbb 100644
>> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
>> @@ -17,6 +17,7 @@
>>  // along with GCC; see the file COPYING3.  If not see
>>  // <http://www.gnu.org/licenses/>.
>>  
>> +
>>  #define INCLUDE_ALGORITHM
>>  #define INCLUDE_FUNCTIONAL
>>  #define INCLUDE_LIST
>> @@ -37,13 +38,12 @@
>>  #include "tree-hash-traits.h"
>>  #include "print-tree.h"
>>  #include "insn-attr.h"
>> -
>>  using namespace rtl_ssa;
>>  
>> -static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7;
>> -static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1));
>> -static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1;
>> -static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1;
>> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_BITS = 7;
>> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_SIGN_BIT = (1 << (PAIR_MEM_IMM_BITS - 1));
>> +static constexpr HOST_WIDE_INT PAIR_MEM_MAX_IMM = PAIR_MEM_IMM_SIGN_BIT - 1;
>> +static constexpr HOST_WIDE_INT PAIR_MEM_MIN_IMM = -PAIR_MEM_MAX_IMM - 1;
>>  
>>  // We pack these fields (load_p, fpsimd_p, and size) into an integer
>>  // (LFS) which we use as part of the key into the main hash tables.
>> @@ -138,8 +138,144 @@ struct alt_base
>>    poly_int64 offset;
>>  };
>>  
>> +// Class that implements a state machine for building the changes needed to form
>> +// a store pair instruction.  This allows us to easily build the changes in
>> +// program order, as required by rtl-ssa.
>> +struct stp_change_builder
>> +{
>> +  enum class state
>> +  {
>> +    FIRST,
>> +    INSERT,
>> +    FIXUP_USE,
>> +    LAST,
>> +    DONE
>> +  };
>> +
>> +  enum class action
>> +  {
>> +    TOMBSTONE,
>> +    CHANGE,
>> +    INSERT,
>> +    FIXUP_USE
>> +  };
>> +
>> +  struct change
>> +  {
>> +    action type;
>> +    insn_info *insn;
>> +  };
>> +
>> +  bool done () const { return m_state == state::DONE; }
>> +
>> +  stp_change_builder (insn_info *insns[2],
>> +		      insn_info *repurpose,
>> +		      insn_info *dest)
>> +    : m_state (state::FIRST), m_insns { insns[0], insns[1] },
>> +      m_repurpose (repurpose), m_dest (dest), m_use (nullptr) {}
>> +
>> +  change get_change () const
>> +  {
>> +    switch (m_state)
>> +      {
>> +      case state::FIRST:
>> +	return {
>> +	  m_insns[0] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
>> +	  m_insns[0]
>> +	};
>> +      case state::LAST:
>> +	return {
>> +	  m_insns[1] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
>> +	  m_insns[1]
>> +	};
>> +      case state::INSERT:
>> +	return { action::INSERT, m_dest };
>> +      case state::FIXUP_USE:
>> +	return { action::FIXUP_USE, m_use->insn () };
>> +      case state::DONE:
>> +	break;
>> +      }
>> +
>> +    gcc_unreachable ();
>> +  }
>> +
>> +  // Transition to the next state.
>> +  void advance ()
>> +  {
>> +    switch (m_state)
>> +      {
>> +      case state::FIRST:
>> +	if (m_repurpose)
>> +	  m_state = state::LAST;
>> +	else
>> +	  m_state = state::INSERT;
>> +	break;
>> +      case state::INSERT:
>> +      {
>> +	def_info *def = memory_access (m_insns[0]->defs ());
>> +	while (*def->next_def ()->insn () <= *m_dest)
>> +	  def = def->next_def ();
>> +
>> +	// Now we know DEF feeds the insertion point for the new stp.
>> +	// Look for any uses of DEF that will consume the new stp.
>> +	gcc_assert (*def->insn () <= *m_dest
>> +		    && *def->next_def ()->insn () > *m_dest);
>> +
>> +	auto set = as_a<set_info *> (def);
>> +	for (auto use : set->nondebug_insn_uses ())
>> +	  if (*use->insn () > *m_dest)
>> +	    {
>> +	      m_use = use;
>> +	      break;
>> +	    }
>> +
>> +	if (m_use)
>> +	  m_state = state::FIXUP_USE;
>> +	else
>> +	  m_state = state::LAST;
>> +	break;
>> +      }
>> +      case state::FIXUP_USE:
>> +	m_use = m_use->next_nondebug_insn_use ();
>> +	if (!m_use)
>> +	  m_state = state::LAST;
>> +	break;
>> +      case state::LAST:
>> +	m_state = state::DONE;
>> +	break;
>> +      case state::DONE:
>> +	gcc_unreachable ();
>> +      }
>> +  }
>> +
>> +private:
>> +  state m_state;
>> +
>> +  // Original candidate stores.
>> +  insn_info *m_insns[2];
>> +
>> +  // If non-null, this is a candidate insn to change into an stp.  Otherwise we
>> +  // are deleting both original insns and inserting a new insn for the stp.
>> +  insn_info *m_repurpose;
>> +
>> +  // Destionation of the stp, it will be placed immediately after m_dest.
>> +  insn_info *m_dest;
>> +
>> +  // Current nondebug use that needs updating due to stp insertion.
>> +  use_info *m_use;
>> +};
>> +
>> +// Virtual base class for load/store walkers used in alias analysis.
>> +struct alias_walker
>> +{
>> +  virtual bool conflict_p (int &budget) const = 0;
>> +  virtual insn_info *insn () const = 0;
>> +  virtual bool valid () const  = 0;
>> +  virtual void advance () = 0;
>> +};
>> +
>>  // State used by the pass for a given basic block.
>> -struct ldp_bb_info
>> +struct pair_fusion
>>  {
>>    using def_hash = nofree_ptr_hash<def_info>;
>>    using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>;
>> @@ -161,13 +297,13 @@ struct ldp_bb_info
>>    static const size_t obstack_alignment = sizeof (void *);
>>    bb_info *m_bb;
>>  
>> -  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
>> +  pair_fusion (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
>>    {
>>      obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
>>  				obstack_alignment, obstack_chunk_alloc,
>>  				obstack_chunk_free);
>>    }
>> -  ~ldp_bb_info ()
>> +  ~pair_fusion ()
>>    {
>>      obstack_free (&m_obstack, nullptr);
>>  
>> @@ -177,10 +313,50 @@ struct ldp_bb_info
>>  	bitmap_obstack_release (&m_bitmap_obstack);
>>        }
>>    }
>> +  void track_access (insn_info *, bool load, rtx mem);
>> +  void transform ();
>> +  void cleanup_tombstones ();
>> +  virtual void set_multiword_subreg (insn_info *i1, insn_info *i2,
>> +				     bool load_p) = 0;
>> +  virtual rtx gen_load_store_pair (rtx *pats,  rtx writeback,
>> +				   bool load_p) = 0;
>> +  void merge_pairs (insn_list_t &, insn_list_t &,
>> +		    bool load_p, unsigned access_size);
>> +  virtual void transform_for_base (int load_size, access_group &group) = 0;
>> +
>> +  bool try_fuse_pair (bool load_p, unsigned access_size,
>> +			     insn_info *i1, insn_info *i2);
>> +
>> +  bool fuse_pair (bool load_p, unsigned access_size,
>> +		  int writeback,
>> +		  insn_info *i1, insn_info *i2,
>> +		  base_cand &base,
>> +		  const insn_range_info &move_range);
>> +
>> +  void do_alias_analysis (insn_info *alias_hazards[4],
>> +			  alias_walker *walkers[4],
>> +			  bool load_p);
>> +
>> +  void track_tombstone (int uid);
>> +
>> +  bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
>>  
>> -  inline void track_access (insn_info *, bool load, rtx mem);
>> -  inline void transform ();
>> -  inline void cleanup_tombstones ();
>> +  virtual bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode,
>> +			       bool load_p) = 0;
>> +
>> +  virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0;
>> +  virtual bool pair_trailing_writeback_p () = 0;
>> +  virtual bool pair_check_register_operand (bool load_p, rtx reg_op,
>> +					    machine_mode mem_mode) = 0;
>> +  virtual int pair_mem_alias_check_limit () = 0;
>> +  virtual bool pair_is_writeback () = 0 ;
>> +  virtual bool pair_mem_ok_policy (rtx first_mem, bool load_p,
>> +				   machine_mode mode) = 0;
>> +  virtual bool fuseable_store_p (insn_info *i1, insn_info *i2) = 0;
>> +  virtual bool fuseable_load_p (insn_info *info) = 0;
>> +
>> +  template<typename Map>
>> +    void traverse_base_map (Map &map);
>>  
>>  private:
>>    obstack m_obstack;
>> @@ -191,100 +367,292 @@ private:
>>    bool m_emitted_tombstone;
>>  
>>    inline splay_tree_node<access_record *> *node_alloc (access_record *);
>> -
>> -  template<typename Map>
>> -  inline void traverse_base_map (Map &map);
>> -  inline void transform_for_base (int load_size, access_group &group);
>> -
>> -  inline void merge_pairs (insn_list_t &, insn_list_t &,
>> -			   bool load_p, unsigned access_size);
>> -
>> -  inline bool try_fuse_pair (bool load_p, unsigned access_size,
>> -			     insn_info *i1, insn_info *i2);
>> -
>> -  inline bool fuse_pair (bool load_p, unsigned access_size,
>> -			 int writeback,
>> -			 insn_info *i1, insn_info *i2,
>> -			 base_cand &base,
>> -			 const insn_range_info &move_range);
>> -
>> -  inline void track_tombstone (int uid);
>> -
>> -  inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
>>  };
>> -
>> -splay_tree_node<access_record *> *
>> -ldp_bb_info::node_alloc (access_record *access)
>> -{
>> -  using T = splay_tree_node<access_record *>;
>> -  void *addr = obstack_alloc (&m_obstack, sizeof (T));
>> -  return new (addr) T (access);
>> -}
>> -
>> -// Given a mem MEM, if the address has side effects, return a MEM that accesses
>> -// the same address but without the side effects.  Otherwise, return
>> -// MEM unchanged.
>> -static rtx
>> -drop_writeback (rtx mem)
>> +// Track the access INSN at offset OFFSET in this access group.
>> +// ALLOC_NODE is used to allocate splay tree nodes.
>> +template<typename Alloc>
>> +void
>> +access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
>>  {
>> -  rtx addr = XEXP (mem, 0);
>> +  auto insert_before = [&](std::list<access_record>::iterator after)
>> +    {
>> +      auto it = list.emplace (after, offset);
>> +      it->cand_insns.push_back (insn);
>> +      it->place = it;
>> +      return &*it;
>> +    };
>>  
>> -  if (!side_effects_p (addr))
>> -    return mem;
>> +  if (!list.size ())
>> +    {
>> +      auto access = insert_before (list.end ());
>> +      tree.insert_max_node (alloc_node (access));
>> +      return;
>> +    }
>>  
>> -  switch (GET_CODE (addr))
>> +  auto compare = [&](splay_tree_node<access_record *> *node)
>>      {
>> -    case PRE_MODIFY:
>> -      addr = XEXP (addr, 1);
>> -      break;
>> -    case POST_MODIFY:
>> -    case POST_INC:
>> -    case POST_DEC:
>> -      addr = XEXP (addr, 0);
>> -      break;
>> -    case PRE_INC:
>> -    case PRE_DEC:
>> +      return compare_sizes_for_sort (offset, node->value ()->offset);
>> +    };
>> +  auto result = tree.lookup (compare);
>> +  splay_tree_node<access_record *> *node = tree.root ();
>> +  if (result == 0)
>> +    node->value ()->cand_insns.push_back (insn);
>> +  else
>>      {
>> -      poly_int64 adjustment = GET_MODE_SIZE (GET_MODE (mem));
>> -      if (GET_CODE (addr) == PRE_DEC)
>> -	adjustment *= -1;
>> -      addr = plus_constant (GET_MODE (addr), XEXP (addr, 0), adjustment);
>> -      break;
>> -    }
>> -    default:
>> -      gcc_unreachable ();
>> +      auto it = node->value ()->place;
>> +      auto after = (result > 0) ? std::next (it) : it;
>> +      auto access = insert_before (after);
>> +      tree.insert_child (node, result > 0, alloc_node (access));
>>      }
>> -
>> -  return change_address (mem, GET_MODE (mem), addr);
>>  }
>>  
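To make the bookkeeping above concrete: for each (base, LFS) key the pass keeps access records ordered by offset, and each record collects the candidate insns seen at that offset.  A rough standalone sketch of that shape, using std::map in place of the rtl-ssa splay tree and plain insn uids, purely as an illustration and not the pass's data structure:

  #include <cstdint>
  #include <map>
  #include <vector>
  #include <cstdio>

  // Hypothetical stand-in: candidate insns grouped by constant offset,
  // kept in offset order, mirroring what access_group::track maintains.
  struct access_group_sketch
  {
    std::map<int64_t, std::vector<int>> by_offset; // offset -> insn uids

    void track (int64_t offset, int insn_uid)
    {
      by_offset[offset].push_back (insn_uid);
    }
  };

  int main ()
  {
    access_group_sketch g;
    g.track (0, 10);
    g.track (8, 12);
    g.track (0, 14);   // second access at offset 0 joins the same record
    for (const auto &rec : g.by_offset)
      printf ("offset %lld: %zu candidate(s)\n",
	      (long long) rec.first, rec.second.size ());
    return 0;
  }

The real structure additionally records each entry's list position (the `place` iterator) so a new offset can be linked in next to its splay-tree neighbour; the sketch glosses over that.
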
>> -// Convenience wrapper around strip_offset that can also look through
>> -// RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
>> -// MEM so that we know the mode of the access.
>> -static rtx
>> -ldp_strip_offset (rtx mem, poly_int64 *offset)
>> +bool
>> +store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget);
>> +bool
>> +load_modified_by_store_p (insn_info *load, insn_info *store,
>> +			  int &budget);
>> +
>> +// Implement some common functionality used by both store_walker
>> +// and load_walker.
>> +template<bool reverse>
>> +class def_walker : public alias_walker
>>  {
>> -  rtx addr = XEXP (mem, 0);
>> +protected:
>> +  using def_iter_t = typename std::conditional<reverse,
>> +	reverse_def_iterator, def_iterator>::type;
>>  
>> -  switch (GET_CODE (addr))
>> -    {
>> -    case PRE_MODIFY:
>> -    case POST_MODIFY:
>> -      addr = strip_offset (XEXP (addr, 1), offset);
>> -      gcc_checking_assert (REG_P (addr));
>> -      gcc_checking_assert (rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr));
>> -      break;
>> -    case PRE_INC:
>> -    case POST_INC:
>> -      addr = XEXP (addr, 0);
>> -      *offset = GET_MODE_SIZE (GET_MODE (mem));
>> -      gcc_checking_assert (REG_P (addr));
>> -      break;
>> -    case PRE_DEC:
>> -    case POST_DEC:
>> -      addr = XEXP (addr, 0);
>> -      *offset = -GET_MODE_SIZE (GET_MODE (mem));
>> -      gcc_checking_assert (REG_P (addr));
>> +  static use_info *start_use_chain (def_iter_t &def_iter)
>> +  {
>> +    set_info *set = nullptr;
>> +    for (; *def_iter; def_iter++)
>> +      {
>> +	set = dyn_cast<set_info *> (*def_iter);
>> +	if (!set)
>> +	  continue;
>> +
>> +	use_info *use = reverse
>> +	  ? set->last_nondebug_insn_use ()
>> +	  : set->first_nondebug_insn_use ();
>> +
>> +	if (use)
>> +	  return use;
>> +      }
>> +
>> +    return nullptr;
>> +  }
>> +
>> +  def_iter_t def_iter;
>> +  insn_info *limit;
>> +  def_walker (def_info *def, insn_info *limit) :
>> +    def_iter (def), limit (limit) {}
>> +
>> +  virtual bool iter_valid () const { return *def_iter; }
>> +
>> +public:
>> +  insn_info *insn () const override { return (*def_iter)->insn (); }
>> +  void advance () override { def_iter++; }
>> +  bool valid () const override final
>> +  {
>> +    if (!iter_valid ())
>> +      return false;
>> +
>> +    if (reverse)
>> +      return *(insn ()) > *limit;
>> +    else
>> +      return *(insn ()) < *limit;
>> +  }
>> +};
>> +
>> +// alias_walker that iterates over stores.
>> +template<bool reverse, typename InsnPredicate>
>> +class store_walker : public def_walker<reverse>
>> +{
>> +  rtx cand_mem;
>> +  InsnPredicate tombstone_p;
>> +
>> +public:
>> +  store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn,
>> +		InsnPredicate tombstone_fn) :
>> +    def_walker<reverse> (mem_def, limit_insn),
>> +    cand_mem (mem), tombstone_p (tombstone_fn) {}
>> +  bool conflict_p (int &budget) const override final
>> +  {
>> +    if (tombstone_p (this->insn ()))
>> +      return false;
>> +
>> +    return store_modifies_mem_p (cand_mem, this->insn (), budget);
>> +  }
>> +};
>> +
>> +// alias_walker that iterates over loads.
>> +template<bool reverse>
>> +class load_walker : public def_walker<reverse>
>> +{
>> +  using Base = def_walker<reverse>;
>> +  using use_iter_t = typename std::conditional<reverse,
>> +	reverse_use_iterator, nondebug_insn_use_iterator>::type;
>> +
>> +  use_iter_t use_iter;
>> +  insn_info *cand_store;
>> +
>> +  bool iter_valid () const override final { return *use_iter; }
>> +
>> +public:
>> +  void advance () override final
>> +  {
>> +    use_iter++;
>> +    if (*use_iter)
>> +      return;
>> +    this->def_iter++;
>> +    use_iter = Base::start_use_chain (this->def_iter);
>> +  }
>> +
>> +  insn_info *insn () const override final
>> +  {
>> +    return (*use_iter)->insn ();
>> +  }
>> +  bool conflict_p (int &budget) const override final
>> +  {
>> +    return load_modified_by_store_p (insn (), cand_store, budget);
>> +  }
>> +  load_walker (def_info *def, insn_info *store, insn_info *limit_insn)
>> +    : Base (def, limit_insn),
>> +      use_iter (Base::start_use_chain (this->def_iter)),
>> +      cand_store (store) {}
>> +};
>> +
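For readers new to these walkers: the only direction-dependent part is the bounds check, which keeps the walk strictly before the limit insn when walking forwards and strictly after it when walking backwards.  A tiny standalone sketch of that predicate over plain instruction indices, as an illustration only (not rtl-ssa code):

  #include <cassert>

  // Hypothetical stand-in for the program-order comparison in
  // def_walker::valid: the walk is valid while the current insn is
  // strictly before LIMIT (forward) or strictly after it (reverse).
  template<bool reverse>
  static bool in_range (int insn_index, int limit_index)
  {
    return reverse ? insn_index > limit_index : insn_index < limit_index;
  }

  int main ()
  {
    assert (in_range<false> (5, 10));   // forward walk: 5 is before the limit
    assert (!in_range<false> (10, 10)); // stop once we reach the limit itself
    assert (in_range<true> (10, 5));    // reverse walk: 10 is still after it
    assert (!in_range<true> (5, 5));
    return 0;
  }
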
>> +extern insn_info *
>> +try_repurpose_store (insn_info *first,
>> +		     insn_info *second,
>> +		     const insn_range_info &move_range);
>> +
>> +void reset_debug_use (use_info *use);
>> +
>> +extern void
>> +fixup_debug_uses (obstack_watermark &attempt,
>> +		  insn_info *insns[2],
>> +		  rtx orig_rtl[2],
>> +		  insn_info *pair_dst,
>> +		  insn_info *trailing_add,
>> +		  bool load_p,
>> +		  int writeback,
>> +		  rtx writeback_effect,
>> +		  unsigned base_regno);
>> +
>> +void
>> +fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>> +			       insn_info *pair_dst,
>> +			       insn_info *trailing_add,
>> +			       rtx writeback_effect);
>> +
>> +
>> +extern void
>> +fixup_debug_use (obstack_watermark &attempt,
>> +		 use_info *use,
>> +		 def_info *def,
>> +		 rtx base,
>> +		 poly_int64 wb_offset);
>> +
>> +extern insn_info *
>> +find_trailing_add (insn_info *insns[2],
>> +		   const insn_range_info &pair_range,
>> +		   int initial_writeback,
>> +		   rtx *writeback_effect,
>> +		   def_info **add_def,
>> +		   def_info *base_def,
>> +		   poly_int64 initial_offset,
>> +		   unsigned access_size);
>> +
>> +rtx drop_writeback (rtx mem);
>> +rtx pair_mem_strip_offset (rtx mem, poly_int64 *offset);
>> +bool any_pre_modify_p (rtx x);
>> +bool any_post_modify_p (rtx x);
>> +int encode_lfs (lfs_fields fields);
>> +extern insn_info *latest_hazard_before (insn_info *insn, rtx *ignore,
>> +		      insn_info *ignore_insn = nullptr);
>> +insn_info *first_hazard_after (insn_info *insn, rtx *ignore);
>> +bool ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2);
>> +insn_range_info get_def_range (def_info *def);
>> +insn_range_info def_downwards_move_range (def_info *def);
>> +insn_range_info def_upwards_move_range (def_info *def);
>> +rtx gen_tombstone (void);
>> +rtx filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr);
>> +rtx combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p);
>> +rtx extract_writebacks (bool load_p, rtx pats[2], int changed);
>> +void do_alias_analysis (insn_info *alias_hazards[4],
>> +		   alias_walker *walkers[4],
>> +		   bool load_p);
>> +int get_viable_bases (insn_info *insns[2],
>> +		  vec<base_cand> &base_cands,
>> +		  rtx cand_mems[2],
>> +		  unsigned access_size,
>> +		  bool reversed);
>> +void dump_insn_list (FILE *f, const insn_list_t &l);
>> +
>> +// Given a mem MEM, if the address has side effects, return a MEM that accesses
>> +// the same address but without the side effects.  Otherwise, return
>> +// MEM unchanged.
>> +rtx
>> +drop_writeback (rtx mem)
>> +{
>> +  rtx addr = XEXP (mem, 0);
>> +
>> +  if (!side_effects_p (addr))
>> +    return mem;
>> +
>> +  switch (GET_CODE (addr))
>> +    {
>> +    case PRE_MODIFY:
>> +      addr = XEXP (addr, 1);
>> +      break;
>> +    case POST_MODIFY:
>> +    case POST_INC:
>> +    case POST_DEC:
>> +      addr = XEXP (addr, 0);
>> +      break;
>> +    case PRE_INC:
>> +    case PRE_DEC:
>> +    {
>> +      poly_int64 adjustment = GET_MODE_SIZE (GET_MODE (mem));
>> +      if (GET_CODE (addr) == PRE_DEC)
>> +	adjustment *= -1;
>> +      addr = plus_constant (GET_MODE (addr), XEXP (addr, 0), adjustment);
>> +      break;
>> +    }
>> +    default:
>> +      gcc_unreachable ();
>> +    }
>> +
>> +  return change_address (mem, GET_MODE (mem), addr);
>> +}
>> +
>> +// Convenience wrapper around strip_offset that can also look through
>> +// RTX_AUTOINC addresses.  The interface is like strip_offset except we take a
>> +// MEM so that we know the mode of the access.
>> +rtx
>> +pair_mem_strip_offset (rtx mem, poly_int64 *offset)
>> +{
>> +  rtx addr = XEXP (mem, 0);
>> +
>> +  switch (GET_CODE (addr))
>> +    {
>> +    case PRE_MODIFY:
>> +    case POST_MODIFY:
>> +      addr = strip_offset (XEXP (addr, 1), offset);
>> +      gcc_checking_assert (REG_P (addr));
>> +      gcc_checking_assert (rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr));
>> +      break;
>> +    case PRE_INC:
>> +    case POST_INC:
>> +      addr = XEXP (addr, 0);
>> +      *offset = GET_MODE_SIZE (GET_MODE (mem));
>> +      gcc_checking_assert (REG_P (addr));
>> +      break;
>> +    case PRE_DEC:
>> +    case POST_DEC:
>> +      addr = XEXP (addr, 0);
>> +      *offset = -GET_MODE_SIZE (GET_MODE (mem));
>> +      gcc_checking_assert (REG_P (addr));
>>        break;
>>  
>>      default:
>> @@ -295,7 +663,7 @@ ldp_strip_offset (rtx mem, poly_int64 *offset)
>>  }
>>  
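A quick summary of what pair_mem_strip_offset reports for each autoinc form, since it matters for the offset canonicalisation later: {PRE,POST}_MODIFY yield the explicit offset, {PRE,POST}_INC yield +size and {PRE,POST}_DEC yield -size.  A standalone sketch of that mapping, with a hypothetical enum and plain integers:

  #include <cassert>
  #include <cstdint>

  // Hypothetical enumeration of the autoinc forms handled above.
  enum class autoinc { pre_modify, post_modify, pre_inc, post_inc,
		       pre_dec, post_dec };

  // Offset recorded by pair_mem_strip_offset for an ACCESS_SIZE-byte
  // access, given the addressing form (EXPLICIT_OFF is only meaningful
  // for the {PRE,POST}_MODIFY cases).
  static int64_t stripped_offset (autoinc form, int64_t access_size,
				  int64_t explicit_off)
  {
    switch (form)
      {
      case autoinc::pre_modify:
      case autoinc::post_modify:
	return explicit_off;
      case autoinc::pre_inc:
      case autoinc::post_inc:
	return access_size;
      case autoinc::pre_dec:
      case autoinc::post_dec:
	return -access_size;
      }
    return 0;
  }

  int main ()
  {
    assert (stripped_offset (autoinc::post_inc, 8, 0) == 8);
    assert (stripped_offset (autoinc::pre_dec, 16, 0) == -16);
    assert (stripped_offset (autoinc::pre_modify, 8, -32) == -32);
    return 0;
  }
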
>>  // Return true if X is a PRE_{INC,DEC,MODIFY} rtx.
>> -static bool
>> +bool
>>  any_pre_modify_p (rtx x)
>>  {
>>    const auto code = GET_CODE (x);
>> @@ -303,318 +671,42 @@ any_pre_modify_p (rtx x)
>>  }
>>  
>>  // Return true if X is a POST_{INC,DEC,MODIFY} rtx.
>> -static bool
>> +bool
>>  any_post_modify_p (rtx x)
>>  {
>>    const auto code = GET_CODE (x);
>>    return code == POST_INC || code == POST_DEC || code == POST_MODIFY;
>>  }
>>  
>> -// Return true if we should consider forming ldp/stp insns from memory
>> -// accesses with operand mode MODE at this stage in compilation.
>> -static bool
>> -ldp_operand_mode_ok_p (machine_mode mode)
>> -{
>> -  const bool allow_qregs
>> -    = !(aarch64_tune_params.extra_tuning_flags
>> -	& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
>> -
>> -  if (!aarch64_ldpstp_operand_mode_p (mode))
>> -    return false;
>> -
>> -  const auto size = GET_MODE_SIZE (mode).to_constant ();
>> -  if (size == 16 && !allow_qregs)
>> -    return false;
>> -
>> -  // We don't pair up TImode accesses before RA because TImode is
>> -  // special in that it can be allocated to a pair of GPRs or a single
>> -  // FPR, and the RA is best placed to make that decision.
>> -  return reload_completed || mode != TImode;
>> -}
>> -
>>  // Given LFS (load_p, fpsimd_p, size) fields in FIELDS, encode these
>>  // into an integer for use as a hash table key.
>> -static int
>> +int
>>  encode_lfs (lfs_fields fields)
>>  {
>>    int size_log2 = exact_log2 (fields.size);
>> -  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
>> +  //gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
>>    return ((int)fields.load_p << 3)
>>      | ((int)fields.fpsimd_p << 2)
>>      | (size_log2 - 2);
>>  }
>>  
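Just to spell out the key layout used above (and by the decode_lfs helper being removed below): load_p, fpsimd_p and log2(size) - 2 are packed into four bits.  A standalone round-trip sketch with plain ints, not the pass's lfs_fields type:

  #include <cassert>

  // Mirror of the packing used by encode_lfs: bit 3 = load_p,
  // bit 2 = fpsimd_p, bits 0-1 = log2(size) - 2 (so sizes 4/8/16).
  static int encode (bool load_p, bool fpsimd_p, int size_log2)
  {
    return ((int) load_p << 3) | ((int) fpsimd_p << 2) | (size_log2 - 2);
  }

  static void decode (int lfs, bool &load_p, bool &fpsimd_p, unsigned &size)
  {
    load_p = lfs & (1 << 3);
    fpsimd_p = lfs & (1 << 2);
    size = 1U << ((lfs & 3) + 2);
  }

  int main ()
  {
    bool l, f; unsigned s;
    decode (encode (true, false, 3), l, f, s);
    assert (l && !f && s == 8);   // an 8-byte GPR load
    return 0;
  }
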
>> -// Inverse of encode_lfs.
>> -static lfs_fields
>> -decode_lfs (int lfs)
>> -{
>> -  bool load_p = (lfs & (1 << 3));
>> -  bool fpsimd_p = (lfs & (1 << 2));
>> -  unsigned size = 1U << ((lfs & 3) + 2);
>> -  return { load_p, fpsimd_p, size };
>> -}
>> +// Dummy predicate that never ignores any insns.
>> +static bool no_ignore (insn_info *) { return false; }
>>  
>> -// Track the access INSN at offset OFFSET in this access group.
>> -// ALLOC_NODE is used to allocate splay tree nodes.
>> -template<typename Alloc>
>> -void
>> -access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
>> -{
>> -  auto insert_before = [&](std::list<access_record>::iterator after)
>> -    {
>> -      auto it = list.emplace (after, offset);
>> -      it->cand_insns.push_back (insn);
>> -      it->place = it;
>> -      return &*it;
>> -    };
>> -
>> -  if (!list.size ())
>> -    {
>> -      auto access = insert_before (list.end ());
>> -      tree.insert_max_node (alloc_node (access));
>> -      return;
>> -    }
>> -
>> -  auto compare = [&](splay_tree_node<access_record *> *node)
>> -    {
>> -      return compare_sizes_for_sort (offset, node->value ()->offset);
>> -    };
>> -  auto result = tree.lookup (compare);
>> -  splay_tree_node<access_record *> *node = tree.root ();
>> -  if (result == 0)
>> -    node->value ()->cand_insns.push_back (insn);
>> -  else
>> -    {
>> -      auto it = node->value ()->place;
>> -      auto after = (result > 0) ? std::next (it) : it;
>> -      auto access = insert_before (after);
>> -      tree.insert_child (node, result > 0, alloc_node (access));
>> -    }
>> -}
>> -
>> -// Given a candidate access INSN (with mem MEM), see if it has a suitable
>> -// MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
>> -// LFS is used as part of the key to the hash table, see track_access.
>> -bool
>> -ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>> -{
>> -  if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
>> -    return false;
>> -
>> -  poly_int64 offset;
>> -  tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
>> -						  &offset);
>> -  if (!base_expr || !DECL_P (base_expr))
>> -    return false;
>> -
>> -  offset += MEM_OFFSET (mem);
>> -
>> -  const machine_mode mem_mode = GET_MODE (mem);
>> -  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>> -
>> -  // Punt on misaligned offsets.  LDP/STP instructions require offsets to be a
>> -  // multiple of the access size, and we believe that misaligned offsets on
>> -  // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
>> -  if (!multiple_p (offset, mem_size))
>> -    return false;
>> -
>> -  const auto key = std::make_pair (base_expr, encode_lfs (lfs));
>> -  access_group &group = expr_map.get_or_insert (key, NULL);
>> -  auto alloc = [&](access_record *access) { return node_alloc (access); };
>> -  group.track (alloc, offset, insn);
>> -
>> -  if (dump_file)
>> -    {
>> -      fprintf (dump_file, "[bb %u] tracking insn %d via ",
>> -	       m_bb->index (), insn->uid ());
>> -      print_node_brief (dump_file, "mem expr", base_expr, 0);
>> -      fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
>> -	       lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
>> -      print_dec (offset, dump_file);
>> -      fprintf (dump_file, "]\n");
>> -    }
>> -
>> -  return true;
>> -}
>> -
>> -// Main function to begin pair discovery.  Given a memory access INSN,
>> -// determine whether it could be a candidate for fusing into an ldp/stp,
>> -// and if so, track it in the appropriate data structure for this basic
>> -// block.  LOAD_P is true if the access is a load, and MEM is the mem
>> -// rtx that occurs in INSN.
>> -void
>> -ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
>> -{
>> -  // We can't combine volatile MEMs, so punt on these.
>> -  if (MEM_VOLATILE_P (mem))
>> -    return;
>> -
>> -  // Ignore writeback accesses if the param says to do so.
>> -  if (!aarch64_ldp_writeback
>> -      && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
>> -    return;
>> -
>> -  const machine_mode mem_mode = GET_MODE (mem);
>> -  if (!ldp_operand_mode_ok_p (mem_mode))
>> -    return;
>> -
>> -  rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);
>> -
>> -  // Ignore the access if the register operand isn't suitable for ldp/stp.
>> -  if (load_p
>> -      ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
>> -      : !aarch64_stp_reg_operand (reg_op, mem_mode))
>> -    return;
>> -
>> -  // We want to segregate FP/SIMD accesses from GPR accesses.
>> -  //
>> -  // Before RA, we use the modes, noting that stores of constant zero
>> -  // operands use GPRs (even in non-integer modes).  After RA, we use
>> -  // the hard register numbers.
>> -  const bool fpsimd_op_p
>> -    = reload_completed
>> -    ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
>> -    : (GET_MODE_CLASS (mem_mode) != MODE_INT
>> -       && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
>> -
>> -  // Note ldp_operand_mode_ok_p already rejected VL modes.
>> -  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>> -  const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };
>> -
>> -  if (track_via_mem_expr (insn, mem, lfs))
>> -    return;
>> -
>> -  poly_int64 mem_off;
>> -  rtx addr = XEXP (mem, 0);
>> -  const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
>> -  rtx base = ldp_strip_offset (mem, &mem_off);
>> -  if (!REG_P (base))
>> -    return;
>> -
>> -  // Need to calculate two (possibly different) offsets:
>> -  //  - Offset at which the access occurs.
>> -  //  - Offset of the new base def.
>> -  poly_int64 access_off;
>> -  if (autoinc_p && any_post_modify_p (addr))
>> -    access_off = 0;
>> -  else
>> -    access_off = mem_off;
>> -
>> -  poly_int64 new_def_off = mem_off;
>> -
>> -  // Punt on accesses relative to eliminable regs.  Since we don't know the
>> -  // elimination offset pre-RA, we should postpone forming pairs on such
>> -  // accesses until after RA.
>> -  //
>> -  // As it stands, addresses with offsets in range for LDR but not
>> -  // in range for LDP/STP are currently reloaded inefficiently,
>> -  // ending up with a separate base register for each pair.
>> -  //
>> -  // In theory LRA should make use of
>> -  // targetm.legitimize_address_displacement to promote sharing of
>> -  // bases among multiple (nearby) address reloads, but the current
>> -  // LRA code returns early from process_address_1 for operands that
>> -  // satisfy "m", even if they don't satisfy the real (relaxed) address
>> -  // constraint; this early return means we never get to the code
>> -  // that calls targetm.legitimize_address_displacement.
>> -  //
>> -  // So for now, it's better to punt when we can't be sure that the
>> -  // offset is in range for LDP/STP.  Out-of-range cases can then be
>> -  // handled after RA by the out-of-range LDP/STP peepholes.  Eventually, it
>> -  // would be nice to handle known out-of-range opportunities in the
>> -  // pass itself (for stack accesses, this would be in the post-RA pass).
>> -  if (!reload_completed
>> -      && (REGNO (base) == FRAME_POINTER_REGNUM
>> -	  || REGNO (base) == ARG_POINTER_REGNUM))
>> -    return;
>> -
>> -  // Now need to find def of base register.
>> -  use_info *base_use = find_access (insn->uses (), REGNO (base));
>> -  gcc_assert (base_use);
>> -  def_info *base_def = base_use->def ();
>> -  if (!base_def)
>> -    {
>> -      if (dump_file)
>> -	fprintf (dump_file,
>> -		 "base register (regno %d) of insn %d is undefined",
>> -		 REGNO (base), insn->uid ());
>> -      return;
>> -    }
>> -
>> -  alt_base *canon_base = canon_base_map.get (base_def);
>> -  if (canon_base)
>> -    {
>> -      // Express this as the combined offset from the canonical base.
>> -      base_def = canon_base->base;
>> -      new_def_off += canon_base->offset;
>> -      access_off += canon_base->offset;
>> -    }
>> -
>> -  if (autoinc_p)
>> -    {
>> -      auto def = find_access (insn->defs (), REGNO (base));
>> -      gcc_assert (def);
>> -
>> -      // Record that DEF = BASE_DEF + MEM_OFF.
>> -      if (dump_file)
>> -	{
>> -	  pretty_printer pp;
>> -	  pp_access (&pp, def, 0);
>> -	  pp_string (&pp, " = ");
>> -	  pp_access (&pp, base_def, 0);
>> -	  fprintf (dump_file, "[bb %u] recording %s + ",
>> -		   m_bb->index (), pp_formatted_text (&pp));
>> -	  print_dec (new_def_off, dump_file);
>> -	  fprintf (dump_file, "\n");
>> -	}
>> -
>> -      alt_base base_rec { base_def, new_def_off };
>> -      if (canon_base_map.put (def, base_rec))
>> -	gcc_unreachable (); // Base defs should be unique.
>> -    }
>> -
>> -  // Punt on misaligned offsets.  LDP/STP require offsets to be a multiple of
>> -  // the access size.
>> -  if (!multiple_p (mem_off, mem_size))
>> -    return;
>> -
>> -  const auto key = std::make_pair (base_def, encode_lfs (lfs));
>> -  access_group &group = def_map.get_or_insert (key, NULL);
>> -  auto alloc = [&](access_record *access) { return node_alloc (access); };
>> -  group.track (alloc, access_off, insn);
>> -
>> -  if (dump_file)
>> -    {
>> -      pretty_printer pp;
>> -      pp_access (&pp, base_def, 0);
>> -
>> -      fprintf (dump_file, "[bb %u] tracking insn %d via %s",
>> -	       m_bb->index (), insn->uid (), pp_formatted_text (&pp));
>> -      fprintf (dump_file,
>> -	       " [L=%d, WB=%d, FP=%d, %smode, off=",
>> -	       lfs.load_p, autoinc_p, lfs.fpsimd_p, mode_name[mem_mode]);
>> -      print_dec (access_off, dump_file);
>> -      fprintf (dump_file, "]\n");
>> -    }
>> -}
>> -
>> -// Dummy predicate that never ignores any insns.
>> -static bool no_ignore (insn_info *) { return false; }
>> -
>> -// Return the latest dataflow hazard before INSN.
>> -//
>> -// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore for
>> -// dataflow purposes.  This is needed when considering changing the RTL base of
>> -// an access discovered through a MEM_EXPR base.
>> -//
>> -// If IGNORE_INSN is non-NULL, we should further ignore any hazards arising
>> -// from that insn.
>> -//
>> -// N.B. we ignore any defs/uses of memory here as we deal with that separately,
>> -// making use of alias disambiguation.
>> -static insn_info *
>> -latest_hazard_before (insn_info *insn, rtx *ignore,
>> -		      insn_info *ignore_insn = nullptr)
>> +// Return the latest dataflow hazard before INSN.
>> +//
>> +// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore for
>> +// dataflow purposes.  This is needed when considering changing the RTL base of
>> +// an access discovered through a MEM_EXPR base.
>> +//
>> +// If IGNORE_INSN is non-NULL, we should further ignore any hazards arising
>> +// from that insn.
>> +//
>> +// N.B. we ignore any defs/uses of memory here as we deal with that separately,
>> +// making use of alias disambiguation.
>> +insn_info *
>> +latest_hazard_before (insn_info *insn, rtx *ignore,
>> +		      insn_info *ignore_insn)
>>  {
>>    insn_info *result = nullptr;
>>  
>> @@ -698,7 +790,7 @@ latest_hazard_before (insn_info *insn, rtx *ignore,
>>  //
>>  // N.B. we ignore any defs/uses of memory here as we deal with that separately,
>>  // making use of alias disambiguation.
>> -static insn_info *
>> +insn_info *
>>  first_hazard_after (insn_info *insn, rtx *ignore)
>>  {
>>    insn_info *result = nullptr;
>> @@ -787,7 +879,7 @@ first_hazard_after (insn_info *insn, rtx *ignore)
>>  }
>>  
>>  // Return true iff R1 and R2 overlap.
>> -static bool
>> +bool
>>  ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
>>  {
>>    // If either range is empty, then their intersection is empty.
>> @@ -799,9 +891,8 @@ ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
>>    // Inverting this, we get the below.
>>    return *r1.last >= *r2.first && *r2.last >= *r1.first;
>>  }
>> -
>>  // Get the range of insns that def feeds.
>> -static insn_range_info get_def_range (def_info *def)
>> +insn_range_info get_def_range (def_info *def)
>>  {
>>    insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
>>    return { def->insn (), last };
>> @@ -809,7 +900,7 @@ static insn_range_info get_def_range (def_info *def)
>>  
>>  // Given a def (of memory), return the downwards range within which we
>>  // can safely move this def.
>> -static insn_range_info
>> +insn_range_info
>>  def_downwards_move_range (def_info *def)
>>  {
>>    auto range = get_def_range (def);
>> @@ -827,7 +918,7 @@ def_downwards_move_range (def_info *def)
>>  
>>  // Given a def (of memory), return the upwards range within which we can
>>  // safely move this def.
>> -static insn_range_info
>> +insn_range_info
>>  def_upwards_move_range (def_info *def)
>>  {
>>    def_info *prev = def->prev_def ();
>> @@ -844,189 +935,18 @@ def_upwards_move_range (def_info *def)
>>    return range;
>>  }
>>  
>> -// Class that implements a state machine for building the changes needed to form
>> -// a store pair instruction.  This allows us to easily build the changes in
>> -// program order, as required by rtl-ssa.
>> -struct stp_change_builder
>> +// Generate the RTL pattern for a "tombstone"; used temporarily during this pass
>> +// to replace stores that are marked for deletion where we can't immediately
>> +// delete the store (since there are uses of mem hanging off the store).
>> +//
>> +// These are deleted at the end of the pass and uses re-parented appropriately
>> +// at this point.
>> +rtx
>> +gen_tombstone (void)
>>  {
>> -  enum class state
>> -  {
>> -    FIRST,
>> -    INSERT,
>> -    FIXUP_USE,
>> -    LAST,
>> -    DONE
>> -  };
>> -
>> -  enum class action
>> -  {
>> -    TOMBSTONE,
>> -    CHANGE,
>> -    INSERT,
>> -    FIXUP_USE
>> -  };
>> -
>> -  struct change
>> -  {
>> -    action type;
>> -    insn_info *insn;
>> -  };
>> -
>> -  bool done () const { return m_state == state::DONE; }
>> -
>> -  stp_change_builder (insn_info *insns[2],
>> -		      insn_info *repurpose,
>> -		      insn_info *dest)
>> -    : m_state (state::FIRST), m_insns { insns[0], insns[1] },
>> -      m_repurpose (repurpose), m_dest (dest), m_use (nullptr) {}
>> -
>> -  change get_change () const
>> -  {
>> -    switch (m_state)
>> -      {
>> -      case state::FIRST:
>> -	return {
>> -	  m_insns[0] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
>> -	  m_insns[0]
>> -	};
>> -      case state::LAST:
>> -	return {
>> -	  m_insns[1] == m_repurpose ? action::CHANGE : action::TOMBSTONE,
>> -	  m_insns[1]
>> -	};
>> -      case state::INSERT:
>> -	return { action::INSERT, m_dest };
>> -      case state::FIXUP_USE:
>> -	return { action::FIXUP_USE, m_use->insn () };
>> -      case state::DONE:
>> -	break;
>> -      }
>> -
>> -    gcc_unreachable ();
>> -  }
>> -
>> -  // Transition to the next state.
>> -  void advance ()
>> -  {
>> -    switch (m_state)
>> -      {
>> -      case state::FIRST:
>> -	if (m_repurpose)
>> -	  m_state = state::LAST;
>> -	else
>> -	  m_state = state::INSERT;
>> -	break;
>> -      case state::INSERT:
>> -      {
>> -	def_info *def = memory_access (m_insns[0]->defs ());
>> -	while (*def->next_def ()->insn () <= *m_dest)
>> -	  def = def->next_def ();
>> -
>> -	// Now we know DEF feeds the insertion point for the new stp.
>> -	// Look for any uses of DEF that will consume the new stp.
>> -	gcc_assert (*def->insn () <= *m_dest
>> -		    && *def->next_def ()->insn () > *m_dest);
>> -
>> -	auto set = as_a<set_info *> (def);
>> -	for (auto use : set->nondebug_insn_uses ())
>> -	  if (*use->insn () > *m_dest)
>> -	    {
>> -	      m_use = use;
>> -	      break;
>> -	    }
>> -
>> -	if (m_use)
>> -	  m_state = state::FIXUP_USE;
>> -	else
>> -	  m_state = state::LAST;
>> -	break;
>> -      }
>> -      case state::FIXUP_USE:
>> -	m_use = m_use->next_nondebug_insn_use ();
>> -	if (!m_use)
>> -	  m_state = state::LAST;
>> -	break;
>> -      case state::LAST:
>> -	m_state = state::DONE;
>> -	break;
>> -      case state::DONE:
>> -	gcc_unreachable ();
>> -      }
>> -  }
>> -
>> -private:
>> -  state m_state;
>> -
>> -  // Original candidate stores.
>> -  insn_info *m_insns[2];
>> -
>> -  // If non-null, this is a candidate insn to change into an stp.  Otherwise we
>> -  // are deleting both original insns and inserting a new insn for the stp.
>> -  insn_info *m_repurpose;
>> -
>> -  // Destionation of the stp, it will be placed immediately after m_dest.
>> -  insn_info *m_dest;
>> -
>> -  // Current nondebug use that needs updating due to stp insertion.
>> -  use_info *m_use;
>> -};
>> -
>> -// Given candidate store insns FIRST and SECOND, see if we can re-purpose one
>> -// of them (together with its def of memory) for the stp insn.  If so, return
>> -// that insn.  Otherwise, return null.
>> -static insn_info *
>> -try_repurpose_store (insn_info *first,
>> -		     insn_info *second,
>> -		     const insn_range_info &move_range)
>> -{
>> -  def_info * const defs[2] = {
>> -    memory_access (first->defs ()),
>> -    memory_access (second->defs ())
>> -  };
>> -
>> -  if (move_range.includes (first)
>> -      || ranges_overlap_p (move_range, def_downwards_move_range (defs[0])))
>> -    return first;
>> -
>> -  if (move_range.includes (second)
>> -      || ranges_overlap_p (move_range, def_upwards_move_range (defs[1])))
>> -    return second;
>> -
>> -  return nullptr;
>> -}
>> -
>> -// Generate the RTL pattern for a "tombstone"; used temporarily during this pass
>> -// to replace stores that are marked for deletion where we can't immediately
>> -// delete the store (since there are uses of mem hanging off the store).
>> -//
>> -// These are deleted at the end of the pass and uses re-parented appropriately
>> -// at this point.
>> -static rtx
>> -gen_tombstone (void)
>> -{
>> -  return gen_rtx_CLOBBER (VOIDmode,
>> -			  gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)));
>> -}
>> -
>> -// Given a pair mode MODE, return a canonical mode to be used for a single
>> -// operand of such a pair.  Currently we only use this when promoting a
>> -// non-writeback pair into a writeback pair, as it isn't otherwise clear
>> -// which mode to use when storing a modeless CONST_INT.
>> -static machine_mode
>> -aarch64_operand_mode_for_pair_mode (machine_mode mode)
>> -{
>> -  switch (mode)
>> -    {
>> -    case E_V2x4QImode:
>> -      return SImode;
>> -    case E_V2x8QImode:
>> -      return DImode;
>> -    case E_V2x16QImode:
>> -      return V16QImode;
>> -    default:
>> -      gcc_unreachable ();
>> -    }
>> -}
>> +  return gen_rtx_CLOBBER (VOIDmode,
>> +			  gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)));
>> +}
>>  
>>  // Go through the reg notes rooted at NOTE, dropping those that we should drop,
>>  // and preserving those that we want to keep by prepending them to (and
>> @@ -1034,7 +954,7 @@ aarch64_operand_mode_for_pair_mode (machine_mode mode)
>>  // REG_EH_REGION note in the resulting list.  FR_EXPR is used to return any
>>  // REG_FRAME_RELATED_EXPR note we find, as these can need special handling in
>>  // combine_reg_notes.
>> -static rtx
>> +rtx
>>  filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
>>  {
>>    for (; note; note = XEXP (note, 1))
>> @@ -1084,7 +1004,7 @@ filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr)
>>  
>>  // Return the notes that should be attached to a combination of I1 and I2, where
>>  // *I1 < *I2.  LOAD_P is true for loads.
>> -static rtx
>> +rtx
>>  combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>>  {
>>    // Temporary storage for REG_FRAME_RELATED_EXPR notes.
>> @@ -1100,8 +1020,8 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>>    if (!load_p)
>>      {
>>        // Simple frame-related sp-relative saves don't need CFI notes, but when
>> -      // we combine them into an stp we will need a CFI note as dwarf2cfi can't
>> -      // interpret the unspec pair representation directly.
>> +      // we combine them into a paired mem store we will need a CFI note as
>> +      // dwarf2cfi can't interpret the unspec pair representation directly.
>>        if (RTX_FRAME_RELATED_P (i1->rtl ()) && !fr_expr[0])
>>  	fr_expr[0] = copy_rtx (PATTERN (i1->rtl ()));
>>        if (RTX_FRAME_RELATED_P (i2->rtl ()) && !fr_expr[1])
>> @@ -1133,7 +1053,7 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p)
>>  // relative to the initial value of the base register, and output these
>>  // in PATS.  Return an rtx that represents the overall change to the
>>  // base register.
>> -static rtx
>> +rtx
>>  extract_writebacks (bool load_p, rtx pats[2], int changed)
>>  {
>>    rtx base_reg = NULL_RTX;
>> @@ -1150,7 +1070,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
>>        const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
>>  
>>        poly_int64 offset;
>> -      rtx this_base = ldp_strip_offset (mem, &offset);
>> +      rtx this_base = pair_mem_strip_offset (mem, &offset);
>>        gcc_assert (REG_P (this_base));
>>        if (base_reg)
>>  	gcc_assert (rtx_equal_p (base_reg, this_base));
>> @@ -1207,7 +1127,7 @@ extract_writebacks (bool load_p, rtx pats[2], int changed)
>>  // base register.  If there is one, we choose the first such update after
>>  // PAIR_DST that is still in the same BB as our pair.  We return the new def in
>>  // *ADD_DEF and the resulting writeback effect in *WRITEBACK_EFFECT.
>> -static insn_info *
>> +insn_info *
>>  find_trailing_add (insn_info *insns[2],
>>  		   const insn_range_info &pair_range,
>>  		   int initial_writeback,
>> @@ -1286,7 +1206,7 @@ find_trailing_add (insn_info *insns[2],
>>  
>>    off_hwi /= access_size;
>>  
>> -  if (off_hwi < LDP_MIN_IMM || off_hwi > LDP_MAX_IMM)
>> +  if (off_hwi < PAIR_MEM_MIN_IMM || off_hwi > PAIR_MEM_MAX_IMM)
>>      return nullptr;
>>  
>>    auto dump_prefix = [&]()
>> @@ -1325,26 +1245,93 @@ find_trailing_add (insn_info *insns[2],
>>    return nullptr;
>>  }
>>  
>> -// We just emitted a tombstone with uid UID, track it in a bitmap for
>> -// this BB so we can easily identify it later when cleaning up tombstones.
>> -void
>> -ldp_bb_info::track_tombstone (int uid)
>> +// Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
>> +// within our BUDGET for alias analysis.
>> +bool
>> +store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
>>  {
>> -  if (!m_emitted_tombstone)
>> +  if (!budget)
>>      {
>> -      // Lazily initialize the bitmap for tracking tombstone insns.
>> -      bitmap_obstack_initialize (&m_bitmap_obstack);
>> -      bitmap_initialize (&m_tombstone_bitmap, &m_bitmap_obstack);
>> -      m_emitted_tombstone = true;
>> +      if (dump_file)
>> +	{
>> +	  fprintf (dump_file,
>> +		   "exceeded budget, assuming store %d aliases with mem ",
>> +		   store_insn->uid ());
>> +	  print_simple_rtl (dump_file, mem);
>> +	  fprintf (dump_file, "\n");
>> +	}
>> +
>> +      return true;
>>      }
>>  
>> -  if (!bitmap_set_bit (&m_tombstone_bitmap, uid))
>> -    gcc_unreachable (); // Bit should have changed.
>> +  budget--;
>> +  return memory_modified_in_insn_p (mem, store_insn->rtl ());
>> +}
>> +
>> +// Return true if LOAD may be modified by STORE.  Make sure we keep
>> +// within our BUDGET for alias analysis.
>> +bool
>> +load_modified_by_store_p (insn_info *load,
>> +			  insn_info *store,
>> +			  int &budget)
>> +{
>> +  gcc_checking_assert (budget >= 0);
>> +
>> +  if (!budget)
>> +    {
>> +      if (dump_file)
>> +	{
>> +	  fprintf (dump_file,
>> +		   "exceeded budget, assuming load %d aliases with store %d\n",
>> +		   load->uid (), store->uid ());
>> +	}
>> +      return true;
>> +    }
>> +
>> +  // It isn't safe to re-order stores over calls.
>> +  if (CALL_P (load->rtl ()))
>> +    return true;
>> +
>> +  budget--;
>> +
>> +  // Iterate over all MEMs in the load, seeing if any alias with
>> +  // our store.
>> +  subrtx_var_iterator::array_type array;
>> +  rtx pat = PATTERN (load->rtl ());
>> +  FOR_EACH_SUBRTX_VAR (iter, array, pat, NONCONST)
>> +    if (MEM_P (*iter) && memory_modified_in_insn_p (*iter, store->rtl ()))
>> +      return true;
>> +
>> +  return false;
>> +}
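Both helpers above follow the same budget discipline: once the per-pair alias budget is spent we give up and conservatively report a conflict, otherwise we pay one unit and do the precise query.  A minimal standalone sketch of that shape; the oracle here is a placeholder, not GCC's memory_modified_in_insn_p:

  #include <cassert>
  #include <functional>

  // Hypothetical budget-limited alias query: conservative "true" once
  // the budget runs out, otherwise charge one unit and ask the oracle.
  static bool may_conflict (int &budget, const std::function<bool ()> &oracle)
  {
    if (!budget)
      return true;   // out of budget: assume the worst
    budget--;
    return oracle ();
  }

  int main ()
  {
    int budget = 1;
    assert (!may_conflict (budget, [] { return false; })); // precise: no alias
    assert (may_conflict (budget, [] { return false; }));  // budget spent
    return 0;
  }
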
>> +// Given candidate store insns FIRST and SECOND, see if we can re-purpose one
>> +// of them (together with its def of memory) for the stp insn.  If so, return
>> +// that insn.  Otherwise, return null.
>> +insn_info *
>> +try_repurpose_store (insn_info *first,
>> +		     insn_info *second,
>> +		     const insn_range_info &move_range)
>> +{
>> +  def_info * const defs[2] = {
>> +    memory_access (first->defs ()),
>> +    memory_access (second->defs ())
>> +  };
>> +
>> +  if (move_range.includes (first)
>> +      || ranges_overlap_p (move_range, def_downwards_move_range (defs[0])))
>> +    return first;
>> +
>> +  if (move_range.includes (second)
>> +      || ranges_overlap_p (move_range, def_upwards_move_range (defs[1])))
>> +    return second;
>> +
>> +  return nullptr;
>>  }
>>  
>> +
>>  // Reset the debug insn containing USE (the debug insn has been
>>  // optimized away).
>> -static void
>> +void
>>  reset_debug_use (use_info *use)
>>  {
>>    auto use_insn = use->insn ();
>> @@ -1355,12 +1342,43 @@ reset_debug_use (use_info *use)
>>    crtl->ssa->change_insn (change);
>>  }
>>  
>> +// Update debug uses when folding in a trailing add insn to form a
>> +// writeback pair.
>> +//
>> +// ATTEMPT is used to allocate RTL-SSA temporaries for the changes,
>> +// the final pair is placed immediately after PAIR_DST, TRAILING_ADD
>> +// is a trailing add insn which is being folded into the pair to make it
>> +// use writeback addressing, and WRITEBACK_EFFECT is the pattern for
>> +// TRAILING_ADD.
>> +void
>> +fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>> +			       insn_info *pair_dst,
>> +			       insn_info *trailing_add,
>> +			       rtx writeback_effect)
>> +{
>> +  rtx base = SET_DEST (writeback_effect);
>> +
>> +  poly_int64 wb_offset;
>> +  rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset);
>> +  gcc_checking_assert (rtx_equal_p (base, base2));
>> +
>> +  auto defs = trailing_add->defs ();
>> +  gcc_checking_assert (defs.size () == 1);
>> +  def_info *def = defs[0];
>> +
>> +  if (auto set = safe_dyn_cast<set_info *> (def->prev_def ()))
>> +    for (auto use : iterate_safely (set->debug_insn_uses ()))
>> +      if (*use->insn () > *pair_dst)
>> +	// DEF is getting re-ordered above USE, fix up USE accordingly.
>> +	fixup_debug_use (attempt, use, def, base, wb_offset);
>> +}
>> +
>>  // USE is a debug use that needs updating because DEF (a def of the same
>>  // register) is being re-ordered over it.  If BASE is non-null, then DEF
>>  // is an update of the register BASE by a constant, given by WB_OFFSET,
>>  // and we can preserve debug info by accounting for the change in side
>>  // effects.
>> -static void
>> +void
>>  fixup_debug_use (obstack_watermark &attempt,
>>  		 use_info *use,
>>  		 def_info *def,
>> @@ -1455,37 +1473,6 @@ fixup_debug_use (obstack_watermark &attempt,
>>      }
>>  }
>>  
>> -// Update debug uses when folding in a trailing add insn to form a
>> -// writeback pair.
>> -//
>> -// ATTEMPT is used to allocate RTL-SSA temporaries for the changes,
>> -// the final pair is placed immediately after PAIR_DST, TRAILING_ADD
>> -// is a trailing add insn which is being folded into the pair to make it
>> -// use writeback addressing, and WRITEBACK_EFFECT is the pattern for
>> -// TRAILING_ADD.
>> -static void
>> -fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>> -			       insn_info *pair_dst,
>> -			       insn_info *trailing_add,
>> -			       rtx writeback_effect)
>> -{
>> -  rtx base = SET_DEST (writeback_effect);
>> -
>> -  poly_int64 wb_offset;
>> -  rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset);
>> -  gcc_checking_assert (rtx_equal_p (base, base2));
>> -
>> -  auto defs = trailing_add->defs ();
>> -  gcc_checking_assert (defs.size () == 1);
>> -  def_info *def = defs[0];
>> -
>> -  if (auto set = safe_dyn_cast<set_info *> (def->prev_def ()))
>> -    for (auto use : iterate_safely (set->debug_insn_uses ()))
>> -      if (*use->insn () > *pair_dst)
>> -	// DEF is getting re-ordered above USE, fix up USE accordingly.
>> -	fixup_debug_use (attempt, use, def, base, wb_offset);
>> -}
>> -
>>  // Called from fuse_pair, fixes up any debug uses that will be affected
>>  // by the changes.
>>  //
>> @@ -1500,7 +1487,7 @@ fixup_debug_uses_trailing_add (obstack_watermark &attempt,
>>  // writeback, and WRITEBACK_EFFECT is an rtx describing the overall update to
>>  // the base register in the final pair (if any).  BASE_REGNO gives the register
>>  // number of the base register used in the final pair.
>> -static void
>> +void
>>  fixup_debug_uses (obstack_watermark &attempt,
>>  		  insn_info *insns[2],
>>  		  rtx orig_rtl[2],
>> @@ -1528,7 +1515,7 @@ fixup_debug_uses (obstack_watermark &attempt,
>>  	  gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0)))
>>  			       == RTX_AUTOINC);
>>  
>> -	  base = ldp_strip_offset (mem, &offset);
>> +	  base = pair_mem_strip_offset (mem, &offset);
>>  	  gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno);
>>  	}
>>        fixup_debug_use (attempt, use, def, base, offset);
>> @@ -1651,621 +1638,846 @@ fixup_debug_uses (obstack_watermark &attempt,
>>  				   writeback_effect);
>>  }
>>  
>> -// Try and actually fuse the pair given by insns I1 and I2.
>> -//
>> -// Here we've done enough analysis to know this is safe, we only
>> -// reject the pair at this stage if either the tuning policy says to,
>> -// or recog fails on the final pair insn.
>> -//
>> -// LOAD_P is true for loads, ACCESS_SIZE gives the access size of each
>> -// candidate insn.  Bit i of WRITEBACK is set if the ith insn (in program
>> -// order) uses writeback.
>> +// Given INSNS (in program order) which are known to be adjacent, look
>> +// to see if either insn has a suitable RTL (register) base that we can
>> +// use to form a pair.  Push these to BASE_CANDS if we find any.  CAND_MEMs
>> +// gives the relevant mems from the candidate insns, ACCESS_SIZE gives the
>> +// size of a single candidate access, and REVERSED says whether the accesses
>> +// are inverted in offset order.
>>  //
>> -// BASE gives the chosen base candidate for the pair and MOVE_RANGE is
>> -// a singleton range which says where to place the pair.
>> -bool
>> -ldp_bb_info::fuse_pair (bool load_p,
>> -			unsigned access_size,
>> -			int writeback,
>> -			insn_info *i1, insn_info *i2,
>> -			base_cand &base,
>> -			const insn_range_info &move_range)
>> +// Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
>> +// addressing.
>> +int
>> +get_viable_bases (insn_info *insns[2],
>> +		  vec<base_cand> &base_cands,
>> +		  rtx cand_mems[2],
>> +		  unsigned access_size,
>> +		  bool reversed)
>>  {
>> -  auto attempt = crtl->ssa->new_change_attempt ();
>> -
>> -  auto make_change = [&attempt](insn_info *insn)
>> -    {
>> -      return crtl->ssa->change_alloc<insn_change> (attempt, insn);
>> -    };
>> -  auto make_delete = [&attempt](insn_info *insn)
>> -    {
>> -      return crtl->ssa->change_alloc<insn_change> (attempt,
>> -						   insn,
>> -						   insn_change::DELETE);
>> -    };
>> -
>> -  insn_info *first = (*i1 < *i2) ? i1 : i2;
>> -  insn_info *second = (first == i1) ? i2 : i1;
>> -
>> -  insn_info *pair_dst = move_range.singleton ();
>> -  gcc_assert (pair_dst);
>> -
>> -  insn_info *insns[2] = { first, second };
>> -
>> -  auto_vec<insn_change *> changes;
>> -  auto_vec<int, 2> tombstone_uids (2);
>> -
>> -  rtx pats[2] = {
>> -    PATTERN (first->rtl ()),
>> -    PATTERN (second->rtl ())
>> -  };
>> -
>> -  // Make copies of the patterns as we might need to refer to the original RTL
>> -  // later, for example when updating debug uses (which is after we've updated
>> -  // one or both of the patterns in the candidate insns).
>> -  rtx orig_rtl[2];
>> +  // We discovered this pair through a common base.  Need to ensure that
>> +  // we have a common base register that is live at both locations.
>> +  def_info *base_defs[2] = {};
>> +  int writeback = 0;
>>    for (int i = 0; i < 2; i++)
>> -    orig_rtl[i] = copy_rtx (pats[i]);
>> -
>> -  use_array input_uses[2] = { first->uses (), second->uses () };
>> -  def_array input_defs[2] = { first->defs (), second->defs () };
>> -
>> -  int changed_insn = -1;
>> -  if (base.from_insn != -1)
>>      {
>> -      // If we're not already using a shared base, we need
>> -      // to re-write one of the accesses to use the base from
>> -      // the other insn.
>> -      gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1);
>> -      changed_insn = !base.from_insn;
>> -
>> -      rtx base_pat = pats[base.from_insn];
>> -      rtx change_pat = pats[changed_insn];
>> -      rtx base_mem = XEXP (base_pat, load_p);
>> -      rtx change_mem = XEXP (change_pat, load_p);
>> +      const bool is_lower = (i == reversed);
>> +      poly_int64 poly_off;
>> +      rtx base = pair_mem_strip_offset (cand_mems[i], &poly_off);
>> +      if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
>> +	writeback |= (1 << i);
>>  
>> -      const bool lower_base_p = (insns[base.from_insn] == i1);
>> -      HOST_WIDE_INT adjust_amt = access_size;
>> -      if (!lower_base_p)
>> -	adjust_amt *= -1;
>> +      if (!REG_P (base) || !poly_off.is_constant ())
>> +	continue;
>>  
>> -      rtx change_reg = XEXP (change_pat, !load_p);
>> -      machine_mode mode_for_mem = GET_MODE (change_mem);
>> -      rtx effective_base = drop_writeback (base_mem);
>> -      rtx new_mem = adjust_address_nv (effective_base,
>> -				       mode_for_mem,
>> -				       adjust_amt);
>> -      rtx new_set = load_p
>> -	? gen_rtx_SET (change_reg, new_mem)
>> -	: gen_rtx_SET (new_mem, change_reg);
>> +      // Punt on accesses relative to eliminable regs.  See the comment in
>> +      // pair_fusion::track_access for a detailed explanation of this.
>> +      if (!reload_completed
>> +	  && (REGNO (base) == FRAME_POINTER_REGNUM
>> +	      || REGNO (base) == ARG_POINTER_REGNUM))
>> +	continue;
>>  
>> -      pats[changed_insn] = new_set;
>> +      HOST_WIDE_INT base_off = poly_off.to_constant ();
>>  
>> -      auto keep_use = [&](use_info *u)
>> +      // It should be unlikely that we ever punt here, since MEM_EXPR offset
>> +      // alignment should be a good proxy for register offset alignment.
>> +      if (base_off % access_size != 0)
>>  	{
>> -	  return refers_to_regno_p (u->regno (), u->regno () + 1,
>> -				    change_pat, &XEXP (change_pat, load_p));
>> -	};
>> -
>> -      // Drop any uses that only occur in the old address.
>> -      input_uses[changed_insn] = filter_accesses (attempt,
>> -						  input_uses[changed_insn],
>> -						  keep_use);
>> -    }
>> -
>> -  rtx writeback_effect = NULL_RTX;
>> -  if (writeback)
>> -    writeback_effect = extract_writebacks (load_p, pats, changed_insn);
>> +	  if (dump_file)
>> +	    fprintf (dump_file,
>> +		     "base not viable, offset misaligned (insn %d)\n",
>> +		     insns[i]->uid ());
>> +	  continue;
>> +	}
>>  
>> -  const auto base_regno = base.def->regno ();
>> +      base_off /= access_size;
>>  
>> -  if (base.from_insn == -1 && (writeback & 1))
>> -    {
>> -      // If the first of the candidate insns had a writeback form, we'll need to
>> -      // drop the use of the updated base register from the second insn's uses.
>> -      //
>> -      // N.B. we needn't worry about the base register occurring as a store
>> -      // operand, as we checked that there was no non-address true dependence
>> -      // between the insns in try_fuse_pair.
>> -      gcc_checking_assert (find_access (input_uses[1], base_regno));
>> -      input_uses[1] = check_remove_regno_access (attempt,
>> -						 input_uses[1],
>> -						 base_regno);
>> -    }
>> +      if (!is_lower)
>> +	base_off--;
>>  
>> -  // Go through and drop uses that only occur in register notes,
>> -  // as we won't be preserving those.
>> -  for (int i = 0; i < 2; i++)
>> -    {
>> -      auto rti = insns[i]->rtl ();
>> -      if (!REG_NOTES (rti))
>> +      if (base_off < PAIR_MEM_MIN_IMM || base_off > PAIR_MEM_MAX_IMM)
>>  	continue;
>>  
>> -      input_uses[i] = remove_note_accesses (attempt, input_uses[i]);
>> +      use_info *use = find_access (insns[i]->uses (), REGNO (base));
>> +      gcc_assert (use);
>> +      base_defs[i] = use->def ();
>>      }
>>  
>> -  // Edge case: if the first insn is a writeback load and the
>> -  // second insn is a non-writeback load which transfers into the base
>> -  // register, then we should drop the writeback altogether as the
>> -  // update of the base register from the second load should prevail.
>> -  //
>> -  // For example:
>> -  //   ldr x2, [x1], #8
>> -  //   ldr x1, [x1]
>> -  //   -->
>> -  //   ldp x2, x1, [x1]
>> -  if (writeback == 1
>> -      && load_p
>> -      && find_access (input_defs[1], base_regno))
>> +  if (!base_defs[0] && !base_defs[1])
>>      {
>>        if (dump_file)
>> -	fprintf (dump_file,
>> -		 "  ldp: i%d has wb but subsequent i%d has non-wb "
>> -		 "update of base (r%d), dropping wb\n",
>> -		 insns[0]->uid (), insns[1]->uid (), base_regno);
>> -      gcc_assert (writeback_effect);
>> -      writeback_effect = NULL_RTX;
>> +	fprintf (dump_file, "no viable base register for pair (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return writeback;
>>      }
>>  
>> -  // So far the patterns have been in instruction order,
>> -  // now we want them in offset order.
>> -  if (i1 != first)
>> -    std::swap (pats[0], pats[1]);
>> -
>> -  poly_int64 offsets[2];
>>    for (int i = 0; i < 2; i++)
>> -    {
>> -      rtx mem = XEXP (pats[i], load_p);
>> -      gcc_checking_assert (MEM_P (mem));
>> -      rtx base = strip_offset (XEXP (mem, 0), offsets + i);
>> -      gcc_checking_assert (REG_P (base));
>> -      gcc_checking_assert (base_regno == REGNO (base));
>> +    if ((writeback & (1 << i)) && !base_defs[i])
>> +      {
>> +	if (dump_file)
>> +	  fprintf (dump_file, "insn %d has writeback but base isn't viable\n",
>> +		   insns[i]->uid ());
>> +	return writeback;
>> +      }
>> +
>> +  if (writeback == 3
>> +      && base_defs[0]->regno () != base_defs[1]->regno ())
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "pair (%d,%d): double writeback with distinct regs (%d,%d): "
>> +		 "punting\n",
>> +		 insns[0]->uid (), insns[1]->uid (),
>> +		 base_defs[0]->regno (), base_defs[1]->regno ());
>> +      return writeback;
>>      }
>>  
>> -  // If either of the original insns had writeback, but the resulting pair insn
>> -  // does not (can happen e.g. in the ldp edge case above, or if the writeback
>> -  // effects cancel out), then drop the def(s) of the base register as
>> -  // appropriate.
>> +  if (base_defs[0] && base_defs[1]
>> +      && base_defs[0]->regno () == base_defs[1]->regno ())
>> +    {
>> +      // Easy case: insns already share the same base reg.
>> +      base_cands.quick_push (base_defs[0]);
>> +      return writeback;
>> +    }
>> +
>> +  // Otherwise, we know that one of the bases must change.
>>    //
>> -  // Also drop the first def in the case that both of the original insns had
>> -  // writeback.  The second def could well have uses, but the first def should
>> -  // only be used by the second insn (and we dropped that use above).
>> +  // Note that if there is writeback we must use the writeback base
>> +  // (we know now there is exactly one).
>>    for (int i = 0; i < 2; i++)
>> -    if ((!writeback_effect && (writeback & (1 << i)))
>> -	|| (i == 0 && writeback == 3))
>> -      input_defs[i] = check_remove_regno_access (attempt,
>> -						 input_defs[i],
>> -						 base_regno);
>> +    if (base_defs[i] && (!writeback || (writeback & (1 << i))))
>> +      base_cands.quick_push (base_cand { base_defs[i], i });
>> +
>> +  return writeback;
>> +}
>> +
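For reference when reading the callers: the return value is a two-bit mask, with bit i set when INSNS[i] uses writeback addressing, so 3 means both do.  A tiny standalone sketch of decoding that convention, as an illustration only:

  #include <cassert>

  // Hypothetical helpers decoding the writeback mask returned by
  // get_viable_bases: bit (1 << i) set iff insns[i] uses writeback.
  static bool insn_uses_writeback (int mask, int i) { return mask & (1 << i); }
  static bool both_use_writeback (int mask) { return mask == 3; }

  int main ()
  {
    const int mask = 1 << 1;             // only the second insn has writeback
    assert (!insn_uses_writeback (mask, 0));
    assert (insn_uses_writeback (mask, 1));
    assert (!both_use_writeback (mask));
    return 0;
  }
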
>> +void
>> +dump_insn_list (FILE *f, const insn_list_t &l)
>> +{
>> +  fprintf (f, "(");
>> +
>> +  auto i = l.begin ();
>> +  auto end = l.end ();
>> +
>> +  if (i != end)
>> +    fprintf (f, "%d", (*i)->uid ());
>> +  i++;
>> +
>> +  for (; i != end; i++)
>> +    fprintf (f, ", %d", (*i)->uid ());
>> +
>> +  fprintf (f, ")");
>> +}
>> +splay_tree_node<access_record *> *
>> +pair_fusion::node_alloc (access_record *access)
>> +{
>> +  using T = splay_tree_node<access_record *>;
>> +  void *addr = obstack_alloc (&m_obstack, sizeof (T));
>> +  return new (addr) T (access);
>> +}
>> +// Given a candidate access INSN (with mem MEM), see if it has a suitable
>> +// MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
>> +// LFS is used as part of the key to the hash table, see track_access.
>> +bool
>> +pair_fusion::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>> +{
>> +  if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
>> +    return false;
>> +
>> +  poly_int64 offset;
>> +  tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
>> +						  &offset);
>> +  if (!base_expr || !DECL_P (base_expr))
>> +    return false;
>> +
>> +  offset += MEM_OFFSET (mem);
>> +
>> +  const machine_mode mem_mode = GET_MODE (mem);
>> +  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>> +
>> +  // Punt on misaligned offsets.  Paired mem instructions require offsets to be a
>> +  // multiple of the access size, and we believe that misaligned offsets on
>> +  // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
>> +  if (!multiple_p (offset, mem_size))
>> +    return false;
>> +
>> +  const auto key = std::make_pair (base_expr, encode_lfs (lfs));
>> +  access_group &group = expr_map.get_or_insert (key, NULL);
>> +  auto alloc = [&](access_record *access) { return node_alloc (access); };
>> +  group.track (alloc, offset, insn);
>> +
>> +  if (dump_file)
>> +    {
>> +      fprintf (dump_file, "[bb %u] tracking insn %d via ",
>> +	       m_bb->index (), insn->uid ());
>> +      print_node_brief (dump_file, "mem expr", base_expr, 0);
>> +      fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
>> +	       lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
>> +      print_dec (offset, dump_file);
>> +      fprintf (dump_file, "]\n");
>> +    }
>> +
>> +  return true;
>> +}
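The multiple_p test above is the entire alignment filter here: an access is only tracked when its MEM_EXPR offset is a multiple of the access size.  A standalone constant-offset sketch of the same predicate (multiple_p also handles poly_int offsets, which this ignores):

  #include <cassert>
  #include <cstdint>

  // Hypothetical constant-offset version of the alignment filter used
  // by track_via_mem_expr: reject offsets that are not a multiple of
  // the access size.
  static bool offset_ok_p (int64_t offset, int64_t access_size)
  {
    return offset % access_size == 0;
  }

  int main ()
  {
    assert (offset_ok_p (16, 8));   // 8-byte access at offset 16: tracked
    assert (!offset_ok_p (12, 8));  // 8-byte access at offset 12: punted
    return 0;
  }
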
>> +// Main function to begin pair discovery.  Given a memory access INSN,
>> +// determine whether it could be a candidate for fusing into a paired mem access,
>> +// and if so, track it in the appropriate data structure for this basic
>> +// block.  LOAD_P is true if the access is a load, and MEM is the mem
>> +// rtx that occurs in INSN.
>> +void
>> +pair_fusion::track_access (insn_info *insn, bool load_p, rtx mem)
>> +{
>> +  // We can't combine volatile MEMs, so punt on these.
>> +  if (MEM_VOLATILE_P (mem))
>> +    return;
>> +
>> +  // Ignore writeback accesses if the target says to do so.
>> +  if (pair_is_writeback ()
>> +      && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
>> +    return;
>> +
>> +  const machine_mode mem_mode = GET_MODE (mem);
>> +
>> +  if (!pair_operand_mode_ok_p (mem_mode))
>> +    return;
>> +
>> +  rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p);
>> +
>> +  if (pair_check_register_operand (load_p, reg_op, mem_mode))
>> +    return;
>> +  // We want to segregate FP/SIMD accesses from GPR accesses.
>> +  //
>> +  // Before RA, we use the modes, noting that stores of constant zero
>> +  // operands use GPRs (even in non-integer modes).  After RA, we use
>> +  // the hard register numbers.
>> +  const bool fpsimd_op_p = is_fpsimd_op_p (reg_op, mem_mode, load_p);
>> +  // Note pair_operand_mode_ok_p already rejected VL modes.
>> +  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>> +  const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size };
>> +
>> +  if (track_via_mem_expr (insn, mem, lfs))
>> +    return;
>> +
>> +  poly_int64 mem_off;
>> +  rtx addr = XEXP (mem, 0);
>> +  const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC;
>> +  rtx base = pair_mem_strip_offset (mem, &mem_off);
>> +  if (!REG_P (base))
>> +    return;
>> +
>> +  // Need to calculate two (possibly different) offsets:
>> +  //  - Offset at which the access occurs.
>> +  //  - Offset of the new base def.
>> +  poly_int64 access_off;
>> +  if (autoinc_p && any_post_modify_p (addr))
>> +    access_off = 0;
>> +  else
>> +    access_off = mem_off;
>> +
>> +  poly_int64 new_def_off = mem_off;
>> +
>> +  // Punt on accesses relative to eliminable regs.  Since we don't know the
>> +  // elimination offset pre-RA, we should postpone forming pairs on such
>> +  // accesses until after RA.
>> +  //
>> +  // As it stands, addresses with offsets in range for LDR but not
>> +  // in range for a paired mem load/store are currently reloaded inefficiently,
>> +  // ending up with a separate base register for each pair.
>> +  //
>> +  // In theory LRA should make use of
>> +  // targetm.legitimize_address_displacement to promote sharing of
>> +  // bases among multiple (nearby) address reloads, but the current
>> +  // LRA code returns early from process_address_1 for operands that
>> +  // satisfy "m", even if they don't satisfy the real (relaxed) address
>> +  // constraint; this early return means we never get to the code
>> +  // that calls targetm.legitimize_address_displacement.
>> +  //
>> +  // So for now, it's better to punt when we can't be sure that the
>> +  // offset is in range for a paired access.  Out-of-range cases can then be
>> +  // handled after RA by the out-of-range pair peepholes.  Eventually, it
>> +  // would be nice to handle known out-of-range opportunities in the
>> +  // pass itself (for stack accesses, this would be in the post-RA pass).
>> +  if (!reload_completed
>> +      && (REGNO (base) == FRAME_POINTER_REGNUM
>> +	  || REGNO (base) == ARG_POINTER_REGNUM))
>> +    return;
>> +
>> +  // Now need to find def of base register.
>> +  use_info *base_use = find_access (insn->uses (), REGNO (base));
>> +  gcc_assert (base_use);
>> +  def_info *base_def = base_use->def ();
>> +  if (!base_def)
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "base register (regno %d) of insn %d is undefined",
>> +		 REGNO (base), insn->uid ());
>> +      return;
>> +    }
>> +
>> +  alt_base *canon_base = canon_base_map.get (base_def);
>> +  if (canon_base)
>> +    {
>> +      // Express this as the combined offset from the canonical base.
>> +      base_def = canon_base->base;
>> +      new_def_off += canon_base->offset;
>> +      access_off += canon_base->offset;
>> +    }
>> +
>> +  if (autoinc_p)
>> +    {
>> +      auto def = find_access (insn->defs (), REGNO (base));
>> +      gcc_assert (def);
>> +
>> +      // Record that DEF = BASE_DEF + MEM_OFF.
>> +      if (dump_file)
>> +	{
>> +	  pretty_printer pp;
>> +	  pp_access (&pp, def, 0);
>> +	  pp_string (&pp, " = ");
>> +	  pp_access (&pp, base_def, 0);
>> +	  fprintf (dump_file, "[bb %u] recording %s + ",
>> +		   m_bb->index (), pp_formatted_text (&pp));
>> +	  print_dec (new_def_off, dump_file);
>> +	  fprintf (dump_file, "\n");
>> +	}
>> +
>> +      alt_base base_rec { base_def, new_def_off };
>> +      if (canon_base_map.put (def, base_rec))
>> +	gcc_unreachable (); // Base defs should be unique.
>> +    }
>> +
>> +  // Punt on misaligned offsets.  Paired accesses require offsets to be
>> +  // a multiple of the access size.
>> +  if (!multiple_p (mem_off, mem_size))
>> +    return;
>> +
>> +  const auto key = std::make_pair (base_def, encode_lfs (lfs));
>> +  access_group &group = def_map.get_or_insert (key, NULL);
>> +  auto alloc = [&](access_record *access) { return node_alloc (access); };
>> +  group.track (alloc, access_off, insn);
>> +
>> +  if (dump_file)
>> +    {
>> +      pretty_printer pp;
>> +      pp_access (&pp, base_def, 0);
>> +
>> +      fprintf (dump_file, "[bb %u] tracking insn %d via %s",
>> +	       m_bb->index (), insn->uid (), pp_formatted_text (&pp));
>> +      fprintf (dump_file,
>> +	       " [L=%d, WB=%d, FP=%d, %smode, off=",
>> +	       lfs.load_p, autoinc_p, lfs.fpsimd_p, mode_name[mem_mode]);
>> +      print_dec (access_off, dump_file);
>> +      fprintf (dump_file, "]\n");
>> +    }
>> +}
>> +
>> +// We just emitted a tombstone with uid UID, track it in a bitmap for
>> +// this BB so we can easily identify it later when cleaning up tombstones.
>> +void
>> +pair_fusion::track_tombstone (int uid)
>> +{
>> +  if (!m_emitted_tombstone)
>> +    {
>> +      // Lazily initialize the bitmap for tracking tombstone insns.
>> +      bitmap_obstack_initialize (&m_bitmap_obstack);
>> +      bitmap_initialize (&m_tombstone_bitmap, &m_bitmap_obstack);
>> +      m_emitted_tombstone = true;
>> +    }
>> +
>> +  if (!bitmap_set_bit (&m_tombstone_bitmap, uid))
>> +    gcc_unreachable (); // Bit should have changed.
>> +}
>> +
>> +// Given two adjacent memory accesses of the same size, I1 and I2, try
>> +// and see if we can merge them into a paired load or store.
>> +//
>> +// ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
>> +// if the accesses are both loads, otherwise they are both stores.
>> +bool
>> +pair_fusion::try_fuse_pair (bool load_p, unsigned access_size,
>> +			    insn_info *i1, insn_info *i2)
>> +{
>> +  if (dump_file)
>> +    fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
>> +	     load_p, i1->uid (), i2->uid ());
>> +
>> +  insn_info *insns[2];
>> +  bool reversed = false;
>> +  if (*i1 < *i2)
>> +    {
>> +      insns[0] = i1;
>> +      insns[1] = i2;
>> +    }
>> +  else
>> +    {
>> +      insns[0] = i2;
>> +      insns[1] = i1;
>> +      reversed = true;
>> +    }
>> +
>> +  rtx cand_mems[2];
>> +  rtx reg_ops[2];
>> +  rtx pats[2];
>> +  for (int i = 0; i < 2; i++)
>> +    {
>> +      pats[i] = PATTERN (insns[i]->rtl ());
>> +      cand_mems[i] = XEXP (pats[i], load_p);
>> +      reg_ops[i] = XEXP (pats[i], !load_p);
>> +    }
>> +
>> +  if (!load_p && !fuseable_store_p (i1, i2))
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "punting on store pair due to non-fuseable candidates (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return false;
>> +    }
>> +
>> +  if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "punting on load pair due to reg conflicts (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return false;
>> +    }
>> +
>> +  if (cfun->can_throw_non_call_exceptions
>> +      && find_reg_note (insns[0]->rtl (), REG_EH_REGION, NULL_RTX)
>> +      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "can't combine insns with EH side effects (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return false;
>> +    }
>> +
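>> +  // Collect the viable base register candidates for the pair.  Bit i of
>> +  // WRITEBACK is set if insns[i] uses writeback addressing.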
>> +  auto_vec<base_cand, 2> base_cands (2);
>> +
>> +  int writeback = get_viable_bases (insns, base_cands, cand_mems,
>> +				    access_size, reversed);
>> +  if (base_cands.is_empty ())
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file, "no viable base for pair (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return false;
>> +    }
>> +
>> +  // Punt on frame-related insns with writeback.  We probably won't see
>> +  // these in practice, but this is conservative and ensures we don't
>> +  // have to worry about these later on.
>> +  if (writeback && (RTX_FRAME_RELATED_P (i1->rtl ())
>> +		    || RTX_FRAME_RELATED_P (i2->rtl ())))
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "rejecting pair (%d,%d): frame-related insn with writeback\n",
>> +		 i1->uid (), i2->uid ());
>> +      return false;
>> +    }
>> +
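>> +  // Check for a true dependence of the second insn on the first, ignoring
>> +  // uses that only occur in the mem address (a dependence through the base
>> +  // address is allowed, see below).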
>> +  rtx *ignore = &XEXP (pats[1], load_p);
>> +  for (auto use : insns[1]->uses ())
>> +    if (!use->is_mem ()
>> +	&& refers_to_regno_p (use->regno (), use->regno () + 1, pats[1], ignore)
>> +	&& use->def () && use->def ()->insn () == insns[0])
>> +      {
>> +	// N.B. we allow a true dependence on the base address, as this
>> +	// happens in the case of auto-inc accesses.  Consider a post-increment
>> +	// load followed by a regular indexed load, for example.
>> +	if (dump_file)
>> +	  fprintf (dump_file,
>> +		   "%d has non-address true dependence on %d, rejecting pair\n",
>> +		   insns[1]->uid (), insns[0]->uid ());
>> +	return false;
>> +      }
>>  
>> -  // If we don't currently have a writeback pair, and we don't have
>> -  // a load that clobbers the base register, look for a trailing destructive
>> -  // update of the base register and try and fold it in to make this into a
>> -  // writeback pair.
>> -  insn_info *trailing_add = nullptr;
>> -  if (aarch64_ldp_writeback > 1
>> -      && !writeback_effect
>> -      && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
>> -					 XEXP (pats[0], 0), nullptr)
>> -		      && !refers_to_regno_p (base_regno, base_regno + 1,
>> -					     XEXP (pats[1], 0), nullptr))))
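>> +  // Work out the dataflow hazards for each remaining base candidate and
>> +  // drop any candidate left without a viable range for placing the pair.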
>> +  unsigned i = 0;
>> +  while (i < base_cands.length ())
>>      {
>> -      def_info *add_def;
>> -      trailing_add = find_trailing_add (insns, move_range, writeback,
>> -					&writeback_effect,
>> -					&add_def, base.def, offsets[0],
>> -					access_size);
>> -      if (trailing_add)
>> +      base_cand &cand = base_cands[i];
>> +
>> +      rtx *ignore[2] = {};
>> +      for (int j = 0; j < 2; j++)
>> +	if (cand.from_insn == !j)
>> +	  ignore[j] = &XEXP (cand_mems[j], 0);
>> +
>> +      insn_info *h = first_hazard_after (insns[0], ignore[0]);
>> +      if (h && *h < *insns[1])
>> +	cand.hazards[0] = h;
>> +
>> +      h = latest_hazard_before (insns[1], ignore[1]);
>> +      if (h && *h > *insns[0])
>> +	cand.hazards[1] = h;
>> +
>> +      if (!cand.viable ())
>>  	{
>> -	  // The def of the base register from the trailing add should prevail.
>> -	  input_defs[0] = insert_access (attempt, add_def, input_defs[0]);
>> -	  gcc_assert (input_defs[0].is_valid ());
>> +	  if (dump_file)
>> +	    fprintf (dump_file,
>> +		     "pair (%d,%d): rejecting base %d due to dataflow "
>> +		     "hazards (%d,%d)\n",
>> +		     insns[0]->uid (),
>> +		     insns[1]->uid (),
>> +		     cand.def->regno (),
>> +		     cand.hazards[0]->uid (),
>> +		     cand.hazards[1]->uid ());
>> +
>> +	  base_cands.ordered_remove (i);
>>  	}
>> +      else
>> +	i++;
>>      }
>>  
>> -  // Now that we know what base mem we're going to use, check if it's OK
>> -  // with the ldp/stp policy.
>> -  rtx first_mem = XEXP (pats[0], load_p);
>> -  if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
>> -						load_p,
>> -						GET_MODE (first_mem)))
>> +  if (base_cands.is_empty ())
>>      {
>>        if (dump_file)
>> -	fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n",
>> -		 i1->uid (), i2->uid ());
>> +	fprintf (dump_file,
>> +		 "can't form pair (%d,%d) due to dataflow hazards\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>>        return false;
>>      }
>>  
>> -  rtx reg_notes = combine_reg_notes (first, second, load_p);
>> +  insn_info *alias_hazards[4] = {};
>>  
>> -  rtx pair_pat;
>> -  if (writeback_effect)
>> +  // First def of memory after the first insn, and last def of memory
>> +  // before the second insn, respectively.
>> +  def_info *mem_defs[2] = {};
>> +  if (load_p)
>>      {
>> -      auto patvec = gen_rtvec (3, writeback_effect, pats[0], pats[1]);
>> -      pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
>> +      if (!MEM_READONLY_P (cand_mems[0]))
>> +	{
>> +	  mem_defs[0] = memory_access (insns[0]->uses ())->def ();
>> +	  gcc_checking_assert (mem_defs[0]);
>> +	  mem_defs[0] = mem_defs[0]->next_def ();
>> +	}
>> +      if (!MEM_READONLY_P (cand_mems[1]))
>> +	{
>> +	  mem_defs[1] = memory_access (insns[1]->uses ())->def ();
>> +	  gcc_checking_assert (mem_defs[1]);
>> +	}
>>      }
>> -  else if (load_p)
>> -    pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
>> -				      XEXP (pats[1], 0),
>> -				      XEXP (pats[0], 1));
>>    else
>> -    pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
>> -				       XEXP (pats[0], 1),
>> -				       XEXP (pats[1], 1));
>> +    {
>> +      mem_defs[0] = memory_access (insns[0]->defs ())->next_def ();
>> +      mem_defs[1] = memory_access (insns[1]->defs ())->prev_def ();
>> +      gcc_checking_assert (mem_defs[0]);
>> +      gcc_checking_assert (mem_defs[1]);
>> +    }
>>  
>> -  insn_change *pair_change = nullptr;
>> -  auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
>> -    rtx_insn *rti = change->insn ()->rtl ();
>> -    validate_unshare_change (rti, &PATTERN (rti), pair_pat, true);
>> -    validate_change (rti, &REG_NOTES (rti), reg_notes, true);
>> +  auto tombstone_p = [&](insn_info *insn) -> bool {
>> +    return m_emitted_tombstone
>> +	   && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ());
>>    };
>>  
>> -  if (load_p)
>> -    {
>> -      changes.safe_push (make_delete (first));
>> -      pair_change = make_change (second);
>> -      changes.safe_push (pair_change);
>> +  store_walker<false, decltype(tombstone_p)>
>> +    forward_store_walker (mem_defs[0], cand_mems[0], insns[1], tombstone_p);
>>  
>> -      pair_change->move_range = move_range;
>> -      pair_change->new_defs = merge_access_arrays (attempt,
>> -						   input_defs[0],
>> -						   input_defs[1]);
>> -      gcc_assert (pair_change->new_defs.is_valid ());
>> +  store_walker<true, decltype(tombstone_p)>
>> +    backward_store_walker (mem_defs[1], cand_mems[1], insns[0], tombstone_p);
>>  
>> -      pair_change->new_uses
>> -	= merge_access_arrays (attempt,
>> -			       drop_memory_access (input_uses[0]),
>> -			       drop_memory_access (input_uses[1]));
>> -      gcc_assert (pair_change->new_uses.is_valid ());
>> -      set_pair_pat (pair_change);
>> -    }
>> +  alias_walker *walkers[4] = {};
>> +  if (mem_defs[0])
>> +    walkers[0] = &forward_store_walker;
>> +  if (mem_defs[1])
>> +    walkers[1] = &backward_store_walker;
>> +
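>> +  // For loads we only need to check for intervening stores; for stores we
>> +  // also need to walk any loads hanging off the candidate stores.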
>> +  if (load_p && (mem_defs[0] || mem_defs[1]))
>> +    do_alias_analysis (alias_hazards, walkers, load_p);
>>    else
>>      {
>> -      using Action = stp_change_builder::action;
>> -      insn_info *store_to_change = try_repurpose_store (first, second,
>> -							move_range);
>> -      stp_change_builder builder (insns, store_to_change, pair_dst);
>> -      insn_change *change;
>> -      set_info *new_set = nullptr;
>> -      for (; !builder.done (); builder.advance ())
>> -	{
>> -	  auto action = builder.get_change ();
>> -	  change = (action.type == Action::INSERT)
>> -	    ? nullptr : make_change (action.insn);
>> -	  switch (action.type)
>> -	    {
>> -	    case Action::CHANGE:
>> -	    {
>> -	      set_pair_pat (change);
>> -	      change->new_uses = merge_access_arrays (attempt,
>> -						      input_uses[0],
>> -						      input_uses[1]);
>> -	      auto d1 = drop_memory_access (input_defs[0]);
>> -	      auto d2 = drop_memory_access (input_defs[1]);
>> -	      change->new_defs = merge_access_arrays (attempt, d1, d2);
>> -	      gcc_assert (change->new_defs.is_valid ());
>> -	      def_info *stp_def = memory_access (change->insn ()->defs ());
>> -	      change->new_defs = insert_access (attempt,
>> -						stp_def,
>> -						change->new_defs);
>> -	      gcc_assert (change->new_defs.is_valid ());
>> -	      change->move_range = move_range;
>> -	      pair_change = change;
>> -	      break;
>> -	    }
>> -	    case Action::TOMBSTONE:
>> -	    {
>> -	      tombstone_uids.quick_push (change->insn ()->uid ());
>> -	      rtx_insn *rti = change->insn ()->rtl ();
>> -	      validate_change (rti, &PATTERN (rti), gen_tombstone (), true);
>> -	      validate_change (rti, &REG_NOTES (rti), NULL_RTX, true);
>> -	      change->new_uses = use_array (nullptr, 0);
>> -	      break;
>> -	    }
>> -	    case Action::INSERT:
>> -	    {
>> -	      if (dump_file)
>> -		fprintf (dump_file,
>> -			 "  stp: cannot re-purpose candidate stores\n");
>> -
>> -	      auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat);
>> -	      change = make_change (new_insn);
>> -	      change->move_range = move_range;
>> -	      change->new_uses = merge_access_arrays (attempt,
>> -						      input_uses[0],
>> -						      input_uses[1]);
>> -	      gcc_assert (change->new_uses.is_valid ());
>> +      // We want to find any loads hanging off the first store.
>> +      mem_defs[0] = memory_access (insns[0]->defs ());
>> +      load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]);
>> +      load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]);
>> +      walkers[2] = &forward_load_walker;
>> +      walkers[3] = &backward_load_walker;
>> +      do_alias_analysis (alias_hazards, walkers, load_p);
>> +      // Now consolidate hazards back down.
>> +      if (alias_hazards[2]
>> +	  && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0])))
>> +	alias_hazards[0] = alias_hazards[2];
>>  
>> -	      auto d1 = drop_memory_access (input_defs[0]);
>> -	      auto d2 = drop_memory_access (input_defs[1]);
>> -	      change->new_defs = merge_access_arrays (attempt, d1, d2);
>> -	      gcc_assert (change->new_defs.is_valid ());
>> +      if (alias_hazards[3]
>> +	  && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1])))
>> +	alias_hazards[1] = alias_hazards[3];
>> +    }
>>  
>> -	      new_set = crtl->ssa->create_set (attempt, new_insn, memory);
>> -	      change->new_defs = insert_access (attempt, new_set,
>> -						change->new_defs);
>> -	      gcc_assert (change->new_defs.is_valid ());
>> -	      pair_change = change;
>> -	      break;
>> -	    }
>> -	    case Action::FIXUP_USE:
>> -	    {
>> -	      // This use now needs to consume memory from our stp.
>> -	      if (dump_file)
>> -		fprintf (dump_file,
>> -			 "  stp: changing i%d to use mem from new stp "
>> -			 "(after i%d)\n",
>> -			 action.insn->uid (), pair_dst->uid ());
>> -	      change->new_uses = drop_memory_access (change->new_uses);
>> -	      gcc_assert (new_set);
>> -	      auto new_use = crtl->ssa->create_use (attempt, action.insn,
>> -						    new_set);
>> -	      change->new_uses = insert_access (attempt, new_use,
>> -						change->new_uses);
>> -	      break;
>> -	    }
>> -	    }
>> -	  changes.safe_push (change);
>> -	}
>> +  if (alias_hazards[0] && alias_hazards[1]
>> +      && *alias_hazards[0] <= *alias_hazards[1])
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n",
>> +		 i1->uid (), i2->uid (),
>> +		 alias_hazards[0]->uid (), alias_hazards[1]->uid ());
>> +      return false;
>>      }
>>  
>> -  if (trailing_add)
>> -    changes.safe_push (make_delete (trailing_add));
>> -  else if ((writeback & 2) && !writeback_effect)
>> +  // Now narrow the hazards on each base candidate using
>> +  // the alias hazards.
>> +  i = 0;
>> +  while (i < base_cands.length ())
>>      {
>> -      // The second insn initially had writeback but now the pair does not,
>> -      // need to update any nondebug uses of the base register def in the
>> -      // second insn.  We'll take care of debug uses later.
>> -      auto def = find_access (insns[1]->defs (), base_regno);
>> -      gcc_assert (def);
>> -      auto set = dyn_cast<set_info *> (def);
>> -      if (set && set->has_nondebug_uses ())
>> -	{
>> -	  auto orig_use = find_access (insns[0]->uses (), base_regno);
>> -	  for (auto use : set->nondebug_insn_uses ())
>> -	    {
>> -	      auto change = make_change (use->insn ());
>> -	      change->new_uses = check_remove_regno_access (attempt,
>> -							    change->new_uses,
>> -							    base_regno);
>> -	      change->new_uses = insert_access (attempt,
>> -						orig_use,
>> -						change->new_uses);
>> -	      changes.safe_push (change);
>> -	    }
>> +      base_cand &cand = base_cands[i];
>> +      if (alias_hazards[0] && (!cand.hazards[0]
>> +			       || *alias_hazards[0] < *cand.hazards[0]))
>> +	cand.hazards[0] = alias_hazards[0];
>> +      if (alias_hazards[1] && (!cand.hazards[1]
>> +			       || *alias_hazards[1] > *cand.hazards[1]))
>> +	cand.hazards[1] = alias_hazards[1];
>> +
>> +      if (cand.viable ())
>> +	i++;
>> +      else
>> +	{
>> +	  if (dump_file)
>> +	    fprintf (dump_file, "pair (%d,%d): rejecting base %d due to "
>> +				"alias/dataflow hazards (%d,%d)",
>> +				insns[0]->uid (), insns[1]->uid (),
>> +				cand.def->regno (),
>> +				cand.hazards[0]->uid (),
>> +				cand.hazards[1]->uid ());
>> +
>> +	  base_cands.ordered_remove (i);
>>  	}
>>      }
>>  
>> -  auto is_changing = insn_is_changing (changes);
>> -  for (unsigned i = 0; i < changes.length (); i++)
>> -    gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing));
>> -
>> -  // Check the pair pattern is recog'd.
>> -  if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing))
>> +  if (base_cands.is_empty ())
>>      {
>>        if (dump_file)
>> -	fprintf (dump_file, "  failed to form pair, recog failed\n");
>> +	fprintf (dump_file,
>> +		 "cannot form pair (%d,%d) due to alias/dataflow hazards",
>> +		 insns[0]->uid (), insns[1]->uid ());
>>  
>> -      // Free any reg notes we allocated.
>> -      while (reg_notes)
>> -	{
>> -	  rtx next = XEXP (reg_notes, 1);
>> -	  free_EXPR_LIST_node (reg_notes);
>> -	  reg_notes = next;
>> -	}
>> -      cancel_changes (0);
>>        return false;
>>      }
>>  
>> -  gcc_assert (crtl->ssa->verify_insn_changes (changes));
>> -
>> -  // Fix up any debug uses that will be affected by the changes.
>> -  if (MAY_HAVE_DEBUG_INSNS)
>> -    fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add,
>> -		      load_p, writeback, writeback_effect, base_regno);
>> -
>> -  confirm_change_group ();
>> -  crtl->ssa->change_insns (changes);
>> -
>> -  gcc_checking_assert (tombstone_uids.length () <= 2);
>> -  for (auto uid : tombstone_uids)
>> -    track_tombstone (uid);
>> -
>> -  return true;
>> -}
>> -
>> -// Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
>> -// within our BUDGET for alias analysis.
>> -static bool
>> -store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
>> -{
>> -  if (!budget)
>> +  base_cand *base = &base_cands[0];
>> +  if (base_cands.length () > 1)
>>      {
>> -      if (dump_file)
>> +      // If there are still multiple viable bases, it makes sense
>> +      // to choose one that allows us to reduce register pressure,
>> +      // for loads this means moving further down, for stores this
>> +      // means moving further up.
>> +      gcc_checking_assert (base_cands.length () == 2);
>> +      const int hazard_i = !load_p;
>> +      if (base->hazards[hazard_i])
>>  	{
>> -	  fprintf (dump_file,
>> -		   "exceeded budget, assuming store %d aliases with mem ",
>> -		   store_insn->uid ());
>> -	  print_simple_rtl (dump_file, mem);
>> -	  fprintf (dump_file, "\n");
>> +	  if (!base_cands[1].hazards[hazard_i])
>> +	    base = &base_cands[1];
>> +	  else if (load_p
>> +		   && *base_cands[1].hazards[hazard_i]
>> +		      > *(base->hazards[hazard_i]))
>> +	    base = &base_cands[1];
>> +	  else if (!load_p
>> +		   && *base_cands[1].hazards[hazard_i]
>> +		      < *(base->hazards[hazard_i]))
>> +	    base = &base_cands[1];
>>  	}
>> -
>> -      return true;
>>      }
>>  
>> -  budget--;
>> -  return memory_modified_in_insn_p (mem, store_insn->rtl ());
>> -}
>> -
>> -// Return true if LOAD may be modified by STORE.  Make sure we keep
>> -// within our BUDGET for alias analysis.
>> -static bool
>> -load_modified_by_store_p (insn_info *load,
>> -			  insn_info *store,
>> -			  int &budget)
>> -{
>> -  gcc_checking_assert (budget >= 0);
>> +  // Otherwise, hazards[0] > hazards[1].
>> +  // Pair can be formed anywhere in (hazards[1], hazards[0]).
>> +  insn_range_info range (insns[0], insns[1]);
>> +  if (base->hazards[1])
>> +    range.first = base->hazards[1];
>> +  if (base->hazards[0])
>> +    range.last = base->hazards[0]->prev_nondebug_insn ();
>>  
>> -  if (!budget)
>> +  // If the second insn can throw, narrow the move range to exactly that insn.
>> +  // This prevents us trying to move the second insn from the end of the BB.
>> +  if (cfun->can_throw_non_call_exceptions
>> +      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
>>      {
>> -      if (dump_file)
>> -	{
>> -	  fprintf (dump_file,
>> -		   "exceeded budget, assuming load %d aliases with store %d\n",
>> -		   load->uid (), store->uid ());
>> -	}
>> -      return true;
>> +      gcc_assert (range.includes (insns[1]));
>> +      range = insn_range_info (insns[1]);
>>      }
>>  
>> -  // It isn't safe to re-order stores over calls.
>> -  if (CALL_P (load->rtl ()))
>> -    return true;
>> +  // Placement strategy: push loads down and pull stores up, this should
>> +  // help register pressure by reducing live ranges.
>> +  if (load_p)
>> +    range.first = range.last;
>> +  else
>> +    range.last = range.first;
>>  
>> -  budget--;
>> +  if (dump_file)
>> +    {
>> +      auto print_hazard = [](insn_info *i)
>> +	{
>> +	  if (i)
>> +	    fprintf (dump_file, "%d", i->uid ());
>> +	  else
>> +	    fprintf (dump_file, "-");
>> +	};
>> +      auto print_pair = [print_hazard](insn_info **i)
>> +	{
>> +	  print_hazard (i[0]);
>> +	  fprintf (dump_file, ",");
>> +	  print_hazard (i[1]);
>> +	};
>>  
>> -  // Iterate over all MEMs in the load, seeing if any alias with
>> -  // our store.
>> -  subrtx_var_iterator::array_type array;
>> -  rtx pat = PATTERN (load->rtl ());
>> -  FOR_EACH_SUBRTX_VAR (iter, array, pat, NONCONST)
>> -    if (MEM_P (*iter) && memory_modified_in_insn_p (*iter, store->rtl ()))
>> -      return true;
>> +      fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
>> +	      load_p, insns[0]->uid (), insns[1]->uid (),
>> +	      base->def->regno ());
>> +      print_pair (base->hazards);
>> +      fprintf (dump_file, "), move_range: (%d,%d)\n",
>> +	       range.first->uid (), range.last->uid ());
>> +    }
>>  
>> -  return false;
>> +  return fuse_pair (load_p, access_size, writeback,
>> +		    i1, i2, *base, range);
>>  }
>>  
>> -// Virtual base class for load/store walkers used in alias analysis.
>> -struct alias_walker
>> -{
>> -  virtual bool conflict_p (int &budget) const = 0;
>> -  virtual insn_info *insn () const = 0;
>> -  virtual bool valid () const  = 0;
>> -  virtual void advance () = 0;
>> -};
>> -
>> -// Implement some common functionality used by both store_walker
>> -// and load_walker.
>> -template<bool reverse>
>> -class def_walker : public alias_walker
>> -{
>> -protected:
>> -  using def_iter_t = typename std::conditional<reverse,
>> -	reverse_def_iterator, def_iterator>::type;
>> -
>> -  static use_info *start_use_chain (def_iter_t &def_iter)
>> -  {
>> -    set_info *set = nullptr;
>> -    for (; *def_iter; def_iter++)
>> -      {
>> -	set = dyn_cast<set_info *> (*def_iter);
>> -	if (!set)
>> -	  continue;
>> -
>> -	use_info *use = reverse
>> -	  ? set->last_nondebug_insn_use ()
>> -	  : set->first_nondebug_insn_use ();
>> -
>> -	if (use)
>> -	  return use;
>> -      }
>> -
>> -    return nullptr;
>> -  }
>> -
>> -  def_iter_t def_iter;
>> -  insn_info *limit;
>> -  def_walker (def_info *def, insn_info *limit) :
>> -    def_iter (def), limit (limit) {}
>> -
>> -  virtual bool iter_valid () const { return *def_iter; }
>> -
>> -public:
>> -  insn_info *insn () const override { return (*def_iter)->insn (); }
>> -  void advance () override { def_iter++; }
>> -  bool valid () const override final
>> -  {
>> -    if (!iter_valid ())
>> -      return false;
>> -
>> -    if (reverse)
>> -      return *(insn ()) > *limit;
>> -    else
>> -      return *(insn ()) < *limit;
>> -  }
>> -};
>>  
>> -// alias_walker that iterates over stores.
>> -template<bool reverse, typename InsnPredicate>
>> -class store_walker : public def_walker<reverse>
>> +// LEFT_LIST and RIGHT_LIST are lists of candidate instructions where all insns
>> +// in LEFT_LIST are known to be adjacent to those in RIGHT_LIST.
>> +//
>> +// This function traverses the resulting 2D matrix of possible pair candidates
>> +// and attempts to merge them into pairs.
>> +//
>> +// The algorithm is straightforward: if we consider a combined list of
>> +// candidates X obtained by merging LEFT_LIST and RIGHT_LIST in program order,
>> +// then we advance through X until we reach a crossing point (where X[i] and
>> +// X[i+1] come from different source lists).
>> +//
>> +// At this point we know X[i] and X[i+1] are adjacent accesses, and we try to
>> +// fuse them into a pair.  If this succeeds, we remove X[i] and X[i+1] from
>> +// their original lists and continue as above.
>> +//
>> +// In the failure case, we advance through the source list containing X[i] and
>> +// continue as above (proceeding to the next crossing point).
>> +//
>> +// The rationale for skipping over groups of consecutive candidates from the
>> +// same source list is as follows:
>> +//
>> +// In the store case, the insns in the group can't be re-ordered over each
>> +// other as they are guaranteed to store to the same location, so we're
>> +// guaranteed not to lose opportunities by doing this.
>> +//
>> +// In the load case, subsequent loads from the same location are either
>> +// redundant (in which case they should have been cleaned up by an earlier
>> +// optimization pass) or there is an intervening aliasing hazard, in which case
>> +// we can't re-order them anyway, so provided earlier passes have cleaned up
>> +// redundant loads, we shouldn't miss opportunities by doing this.
>> +void
>> +pair_fusion::merge_pairs (insn_list_t &left_list,
>> +			  insn_list_t &right_list,
>> +			  bool load_p,
>> +			  unsigned access_size)
>>  {
>> -  rtx cand_mem;
>> -  InsnPredicate tombstone_p;
>> -
>> -public:
>> -  store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn,
>> -		InsnPredicate tombstone_fn) :
>> -    def_walker<reverse> (mem_def, limit_insn),
>> -    cand_mem (mem), tombstone_p (tombstone_fn) {}
>> -
>> -  bool conflict_p (int &budget) const override final
>> -  {
>> -    if (tombstone_p (this->insn ()))
>> -      return false;
>> +  if (dump_file)
>> +    {
>> +      fprintf (dump_file, "merge_pairs [L=%d], cand vecs ", load_p);
>> +      dump_insn_list (dump_file, left_list);
>> +      fprintf (dump_file, " x ");
>> +      dump_insn_list (dump_file, right_list);
>> +      fprintf (dump_file, "\n");
>> +    }
>>  
>> -    return store_modifies_mem_p (cand_mem, this->insn (), budget);
>> -  }
>> -};
>> +  auto iter_l = left_list.begin ();
>> +  auto iter_r = right_list.begin ();
>>  
>> -// alias_walker that iterates over loads.
>> -template<bool reverse>
>> -class load_walker : public def_walker<reverse>
>> +  while (iter_l != left_list.end () && iter_r != right_list.end ())
>> +    {
>> +      auto next_l = std::next (iter_l);
>> +      auto next_r = std::next (iter_r);
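>> +      // If the next candidate on the same side still precedes the head of
>> +      // the other list, we have not yet reached a crossing point, so just
>> +      // advance within that list.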
>> +      if (**iter_l < **iter_r
>> +	  && next_l != left_list.end ()
>> +	  && **next_l < **iter_r)
>> +	iter_l = next_l;
>> +      else if (**iter_r < **iter_l
>> +	       && next_r != right_list.end ()
>> +	       && **next_r < **iter_l)
>> +	iter_r = next_r;
>> +      else if (try_fuse_pair (load_p, access_size, *iter_l, *iter_r))
>> +	{
>> +	  left_list.erase (iter_l);
>> +	  iter_l = next_l;
>> +	  right_list.erase (iter_r);
>> +	  iter_r = next_r;
>> +	}
>> +      else if (**iter_l < **iter_r)
>> +	iter_l = next_l;
>> +      else
>> +	iter_r = next_r;
>> +    }
>> +}
>> +
>> +// If we emitted tombstone insns for this BB, iterate through the BB
>> +// and remove all the tombstone insns, being sure to reparent any uses
>> +// of mem to previous defs when we do this.
>> +void
>> +pair_fusion::cleanup_tombstones ()
>>  {
>> -  using Base = def_walker<reverse>;
>> -  using use_iter_t = typename std::conditional<reverse,
>> -	reverse_use_iterator, nondebug_insn_use_iterator>::type;
>> +  // No need to do anything if we didn't emit a tombstone insn for this BB.
>> +  if (!m_emitted_tombstone)
>> +    return;
>>  
>> -  use_iter_t use_iter;
>> -  insn_info *cand_store;
>> +  insn_info *insn = m_bb->head_insn ();
>> +  while (insn)
>> +    {
>> +      insn_info *next = insn->next_nondebug_insn ();
>> +      if (!insn->is_real ()
>> +	  || !bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()))
>> +	{
>> +	  insn = next;
>> +	  continue;
>> +	}
>>  
>> -  bool iter_valid () const override final { return *use_iter; }
>> +      auto def = memory_access (insn->defs ());
>> +      auto set = dyn_cast<set_info *> (def);
>> +      if (set && set->has_any_uses ())
>> +	{
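>> +	  // Re-parent any uses of the tombstone's memory def onto the
>> +	  // previous def of memory before deleting the tombstone insn.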
>> +	  def_info *prev_def = def->prev_def ();
>> +	  auto prev_set = dyn_cast<set_info *> (prev_def);
>> +	  if (!prev_set)
>> +	    gcc_unreachable ();
>>  
>> -public:
>> -  void advance () override final
>> -  {
>> -    use_iter++;
>> -    if (*use_iter)
>> -      return;
>> -    this->def_iter++;
>> -    use_iter = Base::start_use_chain (this->def_iter);
>> -  }
>> +	  while (set->first_use ())
>> +	    crtl->ssa->reparent_use (set->first_use (), prev_set);
>> +	}
>>  
>> -  insn_info *insn () const override final
>> -  {
>> -    return (*use_iter)->insn ();
>> -  }
>> +      // Now set has no uses, we can delete it.
>> +      insn_change change (insn, insn_change::DELETE);
>> +      crtl->ssa->change_insn (change);
>> +      insn = next;
>> +    }
>> +}
>>  
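>> +// Iterate over the access groups in MAP (keyed on a base and the encoded
>> +// LFS fields) and try to form pairs within each group.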
>> -  bool conflict_p (int &budget) const override final
>> -  {
>> -    return load_modified_by_store_p (insn (), cand_store, budget);
>> -  }
>> +template<typename Map>
>> +void
>> +pair_fusion::traverse_base_map (Map &map)
>> +{
>> +  for (auto kv : map)
>> +    {
>> +      const auto &key = kv.first;
>> +      auto &value = kv.second;
>> +      transform_for_base (key.second, value);
>> +    }
>> +}
>>  
>> -  load_walker (def_info *def, insn_info *store, insn_info *limit_insn)
>> -    : Base (def, limit_insn),
>> -      use_iter (Base::start_use_chain (this->def_iter)),
>> -      cand_store (store) {}
>> -};
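>> +// Try to form pairs from all the accesses tracked in this BB: first those
>> +// grouped by MEM_EXPR base, then those grouped by RTL base def.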
>> +void
>> +pair_fusion::transform ()
>> +{
>> +  traverse_base_map (expr_map);
>> +  traverse_base_map (def_map);
>> +}
>>  
>>  // Process our alias_walkers in a round-robin fashion, proceeding until
>>  // nothing more can be learned from alias analysis.
>>  //
>>  // We try to maintain the invariant that if a walker becomes invalid, we
>>  // set its pointer to null.
>> -static void
>> -do_alias_analysis (insn_info *alias_hazards[4],
>> +void
>> +pair_fusion::do_alias_analysis (insn_info *alias_hazards[4],
>>  		   alias_walker *walkers[4],
>>  		   bool load_p)
>>  {
>>    const int n_walkers = 2 + (2 * !load_p);
>> -  int budget = aarch64_ldp_alias_check_limit;
>> +  int budget = pair_mem_alias_check_limit ();
>>  
>>    auto next_walker = [walkers,n_walkers](int current) -> int {
>>      for (int j = 1; j <= n_walkers; j++)
>> @@ -2341,548 +2553,554 @@ do_alias_analysis (insn_info *alias_hazards[4],
>>      }
>>  }
>>  
>> -// Given INSNS (in program order) which are known to be adjacent, look
>> -// to see if either insn has a suitable RTL (register) base that we can
>> -// use to form a pair.  Push these to BASE_CANDS if we find any.  CAND_MEMs
>> -// gives the relevant mems from the candidate insns, ACCESS_SIZE gives the
>> -// size of a single candidate access, and REVERSED says whether the accesses
>> -// are inverted in offset order.
>> +// Try and actually fuse the pair given by insns I1 and I2.
>>  //
>> -// Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
>> -// addressing.
>> -static int
>> -get_viable_bases (insn_info *insns[2],
>> -		  vec<base_cand> &base_cands,
>> -		  rtx cand_mems[2],
>> -		  unsigned access_size,
>> -		  bool reversed)
>> +// Here we've done enough analysis to know this is safe, we only
>> +// reject the pair at this stage if either the tuning policy says to,
>> +// or recog fails on the final pair insn.
>> +//
>> +// LOAD_P is true for loads, ACCESS_SIZE gives the access size of each
>> +// candidate insn.  Bit i of WRITEBACK is set if the ith insn (in program
>> +// order) uses writeback.
>> +//
>> +// BASE gives the chosen base candidate for the pair and MOVE_RANGE is
>> +// a singleton range which says where to place the pair.
>> +bool
>> +pair_fusion::fuse_pair (bool load_p,
>> +			unsigned access_size,
>> +			int writeback,
>> +			insn_info *i1, insn_info *i2,
>> +			base_cand &base,
>> +			const insn_range_info &move_range)
>>  {
>> -  // We discovered this pair through a common base.  Need to ensure that
>> -  // we have a common base register that is live at both locations.
>> -  def_info *base_defs[2] = {};
>> -  int writeback = 0;
>> -  for (int i = 0; i < 2; i++)
>> -    {
>> -      const bool is_lower = (i == reversed);
>> -      poly_int64 poly_off;
>> -      rtx base = ldp_strip_offset (cand_mems[i], &poly_off);
>> -      if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC)
>> -	writeback |= (1 << i);
>> -
>> -      if (!REG_P (base) || !poly_off.is_constant ())
>> -	continue;
>> -
>> -      // Punt on accesses relative to eliminable regs.  See the comment in
>> -      // ldp_bb_info::track_access for a detailed explanation of this.
>> -      if (!reload_completed
>> -	  && (REGNO (base) == FRAME_POINTER_REGNUM
>> -	      || REGNO (base) == ARG_POINTER_REGNUM))
>> -	continue;
>> -
>> -      HOST_WIDE_INT base_off = poly_off.to_constant ();
>> -
>> -      // It should be unlikely that we ever punt here, since MEM_EXPR offset
>> -      // alignment should be a good proxy for register offset alignment.
>> -      if (base_off % access_size != 0)
>> -	{
>> -	  if (dump_file)
>> -	    fprintf (dump_file,
>> -		     "base not viable, offset misaligned (insn %d)\n",
>> -		     insns[i]->uid ());
>> -	  continue;
>> -	}
>> -
>> -      base_off /= access_size;
>> -
>> -      if (!is_lower)
>> -	base_off--;
>> -
>> -      if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM)
>> -	continue;
>> -
>> -      use_info *use = find_access (insns[i]->uses (), REGNO (base));
>> -      gcc_assert (use);
>> -      base_defs[i] = use->def ();
>> -    }
>> +  auto attempt = crtl->ssa->new_change_attempt ();
>>  
>> -  if (!base_defs[0] && !base_defs[1])
>> +  auto make_change = [&attempt](insn_info *insn)
>>      {
>> -      if (dump_file)
>> -	fprintf (dump_file, "no viable base register for pair (%d,%d)\n",
>> -		 insns[0]->uid (), insns[1]->uid ());
>> -      return writeback;
>> -    }
>> -
>> -  for (int i = 0; i < 2; i++)
>> -    if ((writeback & (1 << i)) && !base_defs[i])
>> -      {
>> -	if (dump_file)
>> -	  fprintf (dump_file, "insn %d has writeback but base isn't viable\n",
>> -		   insns[i]->uid ());
>> -	return writeback;
>> -      }
>> -
>> -  if (writeback == 3
>> -      && base_defs[0]->regno () != base_defs[1]->regno ())
>> +      return crtl->ssa->change_alloc<insn_change> (attempt, insn);
>> +    };
>> +  auto make_delete = [&attempt](insn_info *insn)
>>      {
>> -      if (dump_file)
>> -	fprintf (dump_file,
>> -		 "pair (%d,%d): double writeback with distinct regs (%d,%d): "
>> -		 "punting\n",
>> -		 insns[0]->uid (), insns[1]->uid (),
>> -		 base_defs[0]->regno (), base_defs[1]->regno ());
>> -      return writeback;
>> -    }
>> +      return crtl->ssa->change_alloc<insn_change> (attempt,
>> +						   insn,
>> +						   insn_change::DELETE);
>> +    };
>>  
>> -  if (base_defs[0] && base_defs[1]
>> -      && base_defs[0]->regno () == base_defs[1]->regno ())
>> -    {
>> -      // Easy case: insns already share the same base reg.
>> -      base_cands.quick_push (base_defs[0]);
>> -      return writeback;
>> -    }
>> +  if (*i1 > *i2)
>> +    return false;
>>  
>> -  // Otherwise, we know that one of the bases must change.
>> -  //
>> -  // Note that if there is writeback we must use the writeback base
>> -  // (we know now there is exactly one).
>> -  for (int i = 0; i < 2; i++)
>> -    if (base_defs[i] && (!writeback || (writeback & (1 << i))))
>> -      base_cands.quick_push (base_cand { base_defs[i], i });
>> +  insn_info *first = (*i1 < *i2) ? i1 : i2;
>> +  insn_info *second = (first == i1) ? i2 : i1;
>>  
>> -  return writeback;
>> -}
>> +  insn_info *pair_dst = move_range.singleton ();
>> +  gcc_assert (pair_dst);
>> +
>> +  insn_info *insns[2] = { first, second };
>>  
>> -// Given two adjacent memory accesses of the same size, I1 and I2, try
>> -// and see if we can merge them into a ldp or stp.
>> -//
>> -// ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
>> -// if the accesses are both loads, otherwise they are both stores.
>> -bool
>> -ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
>> -			    insn_info *i1, insn_info *i2)
>> -{
>> -  if (dump_file)
>> -    fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
>> -	     load_p, i1->uid (), i2->uid ());
>> +  auto_vec<insn_change *> changes;
>> +  auto_vec<int, 2> tombstone_uids (2);
>>  
>> -  insn_info *insns[2];
>> -  bool reversed = false;
>> -  if (*i1 < *i2)
>> -    {
>> -      insns[0] = i1;
>> -      insns[1] = i2;
>> -    }
>> -  else
>> -    {
>> -      insns[0] = i2;
>> -      insns[1] = i1;
>> -      reversed = true;
>> -    }
>> +  rtx pats[2] = {
>> +    PATTERN (first->rtl ()),
>> +    PATTERN (second->rtl ())
>> +  };
>>  
>> -  rtx cand_mems[2];
>> -  rtx reg_ops[2];
>> -  rtx pats[2];
>> +  // Make copies of the patterns as we might need to refer to the original RTL
>> +  // later, for example when updating debug uses (which is after we've updated
>> +  // one or both of the patterns in the candidate insns).
>> +  rtx orig_rtl[2];
>>    for (int i = 0; i < 2; i++)
>> -    {
>> -      pats[i] = PATTERN (insns[i]->rtl ());
>> -      cand_mems[i] = XEXP (pats[i], load_p);
>> -      reg_ops[i] = XEXP (pats[i], !load_p);
>> -    }
>> +    orig_rtl[i] = copy_rtx (pats[i]);
>>  
>> -  if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
>> -    {
>> -      if (dump_file)
>> -	fprintf (dump_file,
>> -		 "punting on ldp due to reg conflcits (%d,%d)\n",
>> -		 insns[0]->uid (), insns[1]->uid ());
>> -      return false;
>> -    }
>> +  use_array input_uses[2] = { first->uses (), second->uses () };
>> +  def_array input_defs[2] = { first->defs (), second->defs () };
>>  
>> -  if (cfun->can_throw_non_call_exceptions
>> -      && find_reg_note (insns[0]->rtl (), REG_EH_REGION, NULL_RTX)
>> -      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
>> +  int changed_insn = -1;
>> +  if (base.from_insn != -1)
>>      {
>> -      if (dump_file)
>> -	fprintf (dump_file,
>> -		 "can't combine insns with EH side effects (%d,%d)\n",
>> -		 insns[0]->uid (), insns[1]->uid ());
>> -      return false;
>> -    }
>> +      // If we're not already using a shared base, we need
>> +      // to re-write one of the accesses to use the base from
>> +      // the other insn.
>> +      gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1);
>> +      changed_insn = !base.from_insn;
>>  
>> -  auto_vec<base_cand, 2> base_cands (2);
>> +      rtx base_pat = pats[base.from_insn];
>> +      rtx change_pat = pats[changed_insn];
>> +      rtx base_mem = XEXP (base_pat, load_p);
>> +      rtx change_mem = XEXP (change_pat, load_p);
>>  
>> -  int writeback = get_viable_bases (insns, base_cands, cand_mems,
>> -				    access_size, reversed);
>> -  if (base_cands.is_empty ())
>> -    {
>> -      if (dump_file)
>> -	fprintf (dump_file, "no viable base for pair (%d,%d)\n",
>> -		 insns[0]->uid (), insns[1]->uid ());
>> -      return false;
>> -    }
>> +      const bool lower_base_p = (insns[base.from_insn] == i1);
>> +      HOST_WIDE_INT adjust_amt = access_size;
>> +      if (!lower_base_p)
>> +	adjust_amt *= -1;
>>  
>> -  // Punt on frame-related insns with writeback.  We probably won't see
>> -  // these in practice, but this is conservative and ensures we don't
>> -  // have to worry about these later on.
>> -  if (writeback && (RTX_FRAME_RELATED_P (i1->rtl ())
>> -		    || RTX_FRAME_RELATED_P (i2->rtl ())))
>> -    {
>> -      if (dump_file)
>> -	fprintf (dump_file,
>> -		 "rejecting pair (%d,%d): frame-related insn with writeback\n",
>> -		 i1->uid (), i2->uid ());
>> -      return false;
>> -    }
>> +      rtx change_reg = XEXP (change_pat, !load_p);
>> +      machine_mode mode_for_mem = GET_MODE (change_mem);
>> +      rtx effective_base = drop_writeback (base_mem);
>> +      rtx new_mem = adjust_address_nv (effective_base,
>> +				       mode_for_mem,
>> +				       adjust_amt);
>> +      rtx new_set = load_p
>> +	? gen_rtx_SET (change_reg, new_mem)
>> +	: gen_rtx_SET (new_mem, change_reg);
>>  
>> -  rtx *ignore = &XEXP (pats[1], load_p);
>> -  for (auto use : insns[1]->uses ())
>> -    if (!use->is_mem ()
>> -	&& refers_to_regno_p (use->regno (), use->regno () + 1, pats[1], ignore)
>> -	&& use->def () && use->def ()->insn () == insns[0])
>> -      {
>> -	// N.B. we allow a true dependence on the base address, as this
>> -	// happens in the case of auto-inc accesses.  Consider a post-increment
>> -	// load followed by a regular indexed load, for example.
>> -	if (dump_file)
>> -	  fprintf (dump_file,
>> -		   "%d has non-address true dependence on %d, rejecting pair\n",
>> -		   insns[1]->uid (), insns[0]->uid ());
>> -	return false;
>> -      }
>> +      pats[changed_insn] = new_set;
>>  
>> -  unsigned i = 0;
>> -  while (i < base_cands.length ())
>> -    {
>> -      base_cand &cand = base_cands[i];
>> +      auto keep_use = [&](use_info *u)
>> +	{
>> +	  return refers_to_regno_p (u->regno (), u->regno () + 1,
>> +				    change_pat, &XEXP (change_pat, load_p));
>> +	};
>>  
>> -      rtx *ignore[2] = {};
>> -      for (int j = 0; j < 2; j++)
>> -	if (cand.from_insn == !j)
>> -	  ignore[j] = &XEXP (cand_mems[j], 0);
>> +      // Drop any uses that only occur in the old address.
>> +      input_uses[changed_insn] = filter_accesses (attempt,
>> +						  input_uses[changed_insn],
>> +						  keep_use);
>> +    }
>>  
>> -      insn_info *h = first_hazard_after (insns[0], ignore[0]);
>> -      if (h && *h < *insns[1])
>> -	cand.hazards[0] = h;
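>> +  // If either insn uses writeback addressing, extract the combined
>> +  // writeback effect so it can be applied to the pair.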
>> +  rtx writeback_effect = NULL_RTX;
>> +  if (writeback)
>> +    writeback_effect = extract_writebacks (load_p, pats, changed_insn);
>>  
>> -      h = latest_hazard_before (insns[1], ignore[1]);
>> -      if (h && *h > *insns[0])
>> -	cand.hazards[1] = h;
>> +  const auto base_regno = base.def->regno ();
>>  
>> -      if (!cand.viable ())
>> -	{
>> -	  if (dump_file)
>> -	    fprintf (dump_file,
>> -		     "pair (%d,%d): rejecting base %d due to dataflow "
>> -		     "hazards (%d,%d)\n",
>> -		     insns[0]->uid (),
>> -		     insns[1]->uid (),
>> -		     cand.def->regno (),
>> -		     cand.hazards[0]->uid (),
>> -		     cand.hazards[1]->uid ());
>> +  if (base.from_insn == -1 && (writeback & 1))
>> +    {
>> +      // If the first of the candidate insns had a writeback form, we'll need to
>> +      // drop the use of the updated base register from the second insn's uses.
>> +      //
>> +      // N.B. we needn't worry about the base register occurring as a store
>> +      // operand, as we checked that there was no non-address true dependence
>> +      // between the insns in try_fuse_pair.
>> +      gcc_checking_assert (find_access (input_uses[1], base_regno));
>> +      input_uses[1] = check_remove_regno_access (attempt,
>> +						 input_uses[1],
>> +						 base_regno);
>> +    }
>>  
>> -	  base_cands.ordered_remove (i);
>> -	}
>> -      else
>> -	i++;
>> +  // Go through and drop uses that only occur in register notes,
>> +  // as we won't be preserving those.
>> +  for (int i = 0; i < 2; i++)
>> +    {
>> +      auto rti = insns[i]->rtl ();
>> +      if (!REG_NOTES (rti))
>> +	continue;
>> +
>> +      input_uses[i] = remove_note_accesses (attempt, input_uses[i]);
>>      }
>>  
>> -  if (base_cands.is_empty ())
>> +  // Edge case: if the first insn is a writeback load and the
>> +  // second insn is a non-writeback load which transfers into the base
>> +  // register, then we should drop the writeback altogether as the
>> +  // update of the base register from the second load should prevail.
>> +  //
>> +  // For example:
>> +  //   ldr x2, [x1], #8
>> +  //   ldr x1, [x1]
>> +  //   -->
>> +  //   ldp x2, x1, [x1]
>> +  if (writeback == 1
>> +      && load_p
>> +      && find_access (input_defs[1], base_regno))
>>      {
>>        if (dump_file)
>>  	fprintf (dump_file,
>> -		 "can't form pair (%d,%d) due to dataflow hazards\n",
>> -		 insns[0]->uid (), insns[1]->uid ());
>> -      return false;
>> +		 "  pair_mem: i%d has wb but subsequent i%d has non-wb "
>> +		 "update of base (r%d), dropping wb\n",
>> +		 insns[0]->uid (), insns[1]->uid (), base_regno);
>> +      gcc_assert (writeback_effect);
>> +      writeback_effect = NULL_RTX;
>>      }
>>  
>> -  insn_info *alias_hazards[4] = {};
>> +  // So far the patterns have been in instruction order,
>> +  // now we want them in offset order.
>> +  if (i1 != first)
>> +    std::swap (pats[0], pats[1]);
>>  
>> -  // First def of memory after the first insn, and last def of memory
>> -  // before the second insn, respectively.
>> -  def_info *mem_defs[2] = {};
>> -  if (load_p)
>> +  poly_int64 offsets[2];
>> +  for (int i = 0; i < 2; i++)
>>      {
>> -      if (!MEM_READONLY_P (cand_mems[0]))
>> -	{
>> -	  mem_defs[0] = memory_access (insns[0]->uses ())->def ();
>> -	  gcc_checking_assert (mem_defs[0]);
>> -	  mem_defs[0] = mem_defs[0]->next_def ();
>> -	}
>> -      if (!MEM_READONLY_P (cand_mems[1]))
>> +      rtx mem = XEXP (pats[i], load_p);
>> +      gcc_checking_assert (MEM_P (mem));
>> +      rtx base = strip_offset (XEXP (mem, 0), offsets + i);
>> +      gcc_checking_assert (REG_P (base));
>> +      gcc_checking_assert (base_regno == REGNO (base));
>> +    }
>> +
>> +  // If either of the original insns had writeback, but the resulting pair insn
>> +  // does not (can happen e.g. in the load-pair edge case above, or if the writeback
>> +  // effects cancel out), then drop the def(s) of the base register as
>> +  // appropriate.
>> +  //
>> +  // Also drop the first def in the case that both of the original insns had
>> +  // writeback.  The second def could well have uses, but the first def should
>> +  // only be used by the second insn (and we dropped that use above).
>> +  for (int i = 0; i < 2; i++)
>> +    if ((!writeback_effect && (writeback & (1 << i)))
>> +	|| (i == 0 && writeback == 3))
>> +      input_defs[i] = check_remove_regno_access (attempt,
>> +						 input_defs[i],
>> +						 base_regno);
>> +
>> +  // If we don't currently have a writeback pair, and we don't have
>> +  // a load that clobbers the base register, look for a trailing destructive
>> +  // update of the base register and try and fold it in to make this into a
>> +  // writeback pair.
>> +  insn_info *trailing_add = nullptr;
>> +  if (pair_trailing_writeback_p ()
>> +      && !writeback_effect
>> +      && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
>> +					 XEXP (pats[0], 0), nullptr)
>> +		      && !refers_to_regno_p (base_regno, base_regno + 1,
>> +					     XEXP (pats[1], 0), nullptr))))
>> +    {
>> +      def_info *add_def;
>> +      trailing_add = find_trailing_add (insns, move_range, writeback,
>> +					&writeback_effect,
>> +					&add_def, base.def, offsets[0],
>> +					access_size);
>> +      if (trailing_add)
>>  	{
>> -	  mem_defs[1] = memory_access (insns[1]->uses ())->def ();
>> -	  gcc_checking_assert (mem_defs[1]);
>> +	  // The def of the base register from the trailing add should prevail.
>> +	  input_defs[0] = insert_access (attempt, add_def, input_defs[0]);
>> +	  gcc_assert (input_defs[0].is_valid ());
>>  	}
>>      }
>> -  else
>> +
>> +  // Now that we know what base mem we're going to use, check if it's OK
>> +  // with the pair mem policy.
>> +  rtx first_mem = XEXP (pats[0], load_p);
>> +  if (!pair_mem_ok_policy (first_mem,
>> +			  load_p,
>> +			  GET_MODE (first_mem)))
>>      {
>> -      mem_defs[0] = memory_access (insns[0]->defs ())->next_def ();
>> -      mem_defs[1] = memory_access (insns[1]->defs ())->prev_def ();
>> -      gcc_checking_assert (mem_defs[0]);
>> -      gcc_checking_assert (mem_defs[1]);
>> +      if (dump_file)
>> +	fprintf (dump_file, "punting on pair (%d,%d), pair mem policy says no\n",
>> +		 i1->uid (), i2->uid ());
>> +      return false;
>>      }
>>  
>> -  auto tombstone_p = [&](insn_info *insn) -> bool {
>> -    return m_emitted_tombstone
>> -	   && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ());
>> -  };
>> +  rtx reg_notes = combine_reg_notes (first, second, load_p);
>>  
>> -  store_walker<false, decltype(tombstone_p)>
>> -    forward_store_walker (mem_defs[0], cand_mems[0], insns[1], tombstone_p);
>> +  rtx pair_pat;
>>  
>> -  store_walker<true, decltype(tombstone_p)>
>> -    backward_store_walker (mem_defs[1], cand_mems[1], insns[0], tombstone_p);
>> +  set_multiword_subreg (first, second, load_p);
>>  
>> -  alias_walker *walkers[4] = {};
>> -  if (mem_defs[0])
>> -    walkers[0] = &forward_store_walker;
>> -  if (mem_defs[1])
>> -    walkers[1] = &backward_store_walker;
>> +  pair_pat = gen_load_store_pair (pats, writeback_effect, load_p);
>> +  if (pair_pat == NULL_RTX)
>> +    return false;
>> +
>> +  insn_change *pair_change = nullptr;
>> +  auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
>> +    rtx_insn *rti = change->insn ()->rtl ();
>> +    validate_unshare_change (rti, &PATTERN (rti), pair_pat, true);
>> +    validate_change (rti, &REG_NOTES (rti), reg_notes, true);
>> +  };
>>  
>> -  if (load_p && (mem_defs[0] || mem_defs[1]))
>> -    do_alias_analysis (alias_hazards, walkers, load_p);
>> -  else
>> +  if (load_p)
>>      {
>> -      // We want to find any loads hanging off the first store.
>> -      mem_defs[0] = memory_access (insns[0]->defs ());
>> -      load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]);
>> -      load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]);
>> -      walkers[2] = &forward_load_walker;
>> -      walkers[3] = &backward_load_walker;
>> -      do_alias_analysis (alias_hazards, walkers, load_p);
>> -      // Now consolidate hazards back down.
>> -      if (alias_hazards[2]
>> -	  && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0])))
>> -	alias_hazards[0] = alias_hazards[2];
>> +      changes.safe_push (make_delete (first));
>> +      pair_change = make_change (second);
>> +      changes.safe_push (pair_change);
>>  
>> -      if (alias_hazards[3]
>> -	  && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1])))
>> -	alias_hazards[1] = alias_hazards[3];
>> -    }
>> +      pair_change->move_range = move_range;
>> +      pair_change->new_defs = merge_access_arrays (attempt,
>> +						   input_defs[0],
>> +						   input_defs[1]);
>> +      gcc_assert (pair_change->new_defs.is_valid ());
>>  
>> -  if (alias_hazards[0] && alias_hazards[1]
>> -      && *alias_hazards[0] <= *alias_hazards[1])
>> -    {
>> -      if (dump_file)
>> -	fprintf (dump_file,
>> -		 "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n",
>> -		 i1->uid (), i2->uid (),
>> -		 alias_hazards[0]->uid (), alias_hazards[1]->uid ());
>> -      return false;
>> +      pair_change->new_uses
>> +	= merge_access_arrays (attempt,
>> +			       drop_memory_access (input_uses[0]),
>> +			       drop_memory_access (input_uses[1]));
>> +      gcc_assert (pair_change->new_uses.is_valid ());
>> +      set_pair_pat (pair_change);
>>      }
>> -
>> -  // Now narrow the hazards on each base candidate using
>> -  // the alias hazards.
>> -  i = 0;
>> -  while (i < base_cands.length ())
>> +  else
>>      {
>> -      base_cand &cand = base_cands[i];
>> -      if (alias_hazards[0] && (!cand.hazards[0]
>> -			       || *alias_hazards[0] < *cand.hazards[0]))
>> -	cand.hazards[0] = alias_hazards[0];
>> -      if (alias_hazards[1] && (!cand.hazards[1]
>> -			       || *alias_hazards[1] > *cand.hazards[1]))
>> -	cand.hazards[1] = alias_hazards[1];
>> -
>> -      if (cand.viable ())
>> -	i++;
>> -      else
>> +      using Action = stp_change_builder::action;
>> +      insn_info *store_to_change = try_repurpose_store (first, second,
>> +							move_range);
>> +      stp_change_builder builder (insns, store_to_change, pair_dst);
>> +      insn_change *change;
>> +      set_info *new_set = nullptr;
>> +      for (; !builder.done (); builder.advance ())
>>  	{
>> -	  if (dump_file)
>> -	    fprintf (dump_file, "pair (%d,%d): rejecting base %d due to "
>> -				"alias/dataflow hazards (%d,%d)",
>> -				insns[0]->uid (), insns[1]->uid (),
>> -				cand.def->regno (),
>> -				cand.hazards[0]->uid (),
>> -				cand.hazards[1]->uid ());
>> -
>> -	  base_cands.ordered_remove (i);
>> -	}
>> -    }
>> +	  auto action = builder.get_change ();
>> +	  change = (action.type == Action::INSERT)
>> +	    ? nullptr : make_change (action.insn);
>> +	  switch (action.type)
>> +	    {
>> +	    case Action::CHANGE:
>> +	    {
>> +	      set_pair_pat (change);
>> +	      change->new_uses = merge_access_arrays (attempt,
>> +						      input_uses[0],
>> +						      input_uses[1]);
>> +	      auto d1 = drop_memory_access (input_defs[0]);
>> +	      auto d2 = drop_memory_access (input_defs[1]);
>> +	      change->new_defs = merge_access_arrays (attempt, d1, d2);
>> +	      gcc_assert (change->new_defs.is_valid ());
>> +	      def_info *stp_def = memory_access (change->insn ()->defs ());
>> +	      change->new_defs = insert_access (attempt,
>> +						stp_def,
>> +						change->new_defs);
>> +	      gcc_assert (change->new_defs.is_valid ());
>> +	      change->move_range = move_range;
>> +	      pair_change = change;
>> +	      break;
>> +	    }
>> +	    case Action::TOMBSTONE:
>> +	    {
>> +	      tombstone_uids.quick_push (change->insn ()->uid ());
>> +	      rtx_insn *rti = change->insn ()->rtl ();
>> +	      validate_change (rti, &PATTERN (rti), gen_tombstone (), true);
>> +	      validate_change (rti, &REG_NOTES (rti), NULL_RTX, true);
>> +	      change->new_uses = use_array (nullptr, 0);
>> +	      break;
>> +	    }
>> +	    case Action::INSERT:
>> +	    {
>> +	      if (dump_file)
>> +		fprintf (dump_file,
>> +			 "  stp: cannot re-purpose candidate stores\n");
>>  
>> -  if (base_cands.is_empty ())
>> -    {
>> -      if (dump_file)
>> -	fprintf (dump_file,
>> -		 "cannot form pair (%d,%d) due to alias/dataflow hazards",
>> -		 insns[0]->uid (), insns[1]->uid ());
>> +	      auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat);
>> +	      change = make_change (new_insn);
>> +	      change->move_range = move_range;
>> +	      change->new_uses = merge_access_arrays (attempt,
>> +						      input_uses[0],
>> +						      input_uses[1]);
>> +	      gcc_assert (change->new_uses.is_valid ());
>>  
>> -      return false;
>> -    }
>> +	      auto d1 = drop_memory_access (input_defs[0]);
>> +	      auto d2 = drop_memory_access (input_defs[1]);
>> +	      change->new_defs = merge_access_arrays (attempt, d1, d2);
>> +	      gcc_assert (change->new_defs.is_valid ());
>>  
>> -  base_cand *base = &base_cands[0];
>> -  if (base_cands.length () > 1)
>> -    {
>> -      // If there are still multiple viable bases, it makes sense
>> -      // to choose one that allows us to reduce register pressure,
>> -      // for loads this means moving further down, for stores this
>> -      // means moving further up.
>> -      gcc_checking_assert (base_cands.length () == 2);
>> -      const int hazard_i = !load_p;
>> -      if (base->hazards[hazard_i])
>> -	{
>> -	  if (!base_cands[1].hazards[hazard_i])
>> -	    base = &base_cands[1];
>> -	  else if (load_p
>> -		   && *base_cands[1].hazards[hazard_i]
>> -		      > *(base->hazards[hazard_i]))
>> -	    base = &base_cands[1];
>> -	  else if (!load_p
>> -		   && *base_cands[1].hazards[hazard_i]
>> -		      < *(base->hazards[hazard_i]))
>> -	    base = &base_cands[1];
>> +	      new_set = crtl->ssa->create_set (attempt, new_insn, memory);
>> +	      change->new_defs = insert_access (attempt, new_set,
>> +						change->new_defs);
>> +	      gcc_assert (change->new_defs.is_valid ());
>> +	      pair_change = change;
>> +	      break;
>> +	    }
>> +	    case Action::FIXUP_USE:
>> +	    {
>> +	      // This use now needs to consume memory from our stp.
>> +	      if (dump_file)
>> +		fprintf (dump_file,
>> +			 "  stp: changing i%d to use mem from new stp "
>> +			 "(after i%d)\n",
>> +			 action.insn->uid (), pair_dst->uid ());
>> +	      change->new_uses = drop_memory_access (change->new_uses);
>> +	      gcc_assert (new_set);
>> +	      auto new_use = crtl->ssa->create_use (attempt, action.insn,
>> +						    new_set);
>> +	      change->new_uses = insert_access (attempt, new_use,
>> +						change->new_uses);
>> +	      break;
>> +	    }
>> +	    }
>> +	  changes.safe_push (change);
>>  	}
>>      }
>>  
>> -  // Otherwise, hazards[0] > hazards[1].
>> -  // Pair can be formed anywhere in (hazards[1], hazards[0]).
>> -  insn_range_info range (insns[0], insns[1]);
>> -  if (base->hazards[1])
>> -    range.first = base->hazards[1];
>> -  if (base->hazards[0])
>> -    range.last = base->hazards[0]->prev_nondebug_insn ();
>> -
>> -  // If the second insn can throw, narrow the move range to exactly that insn.
>> -  // This prevents us trying to move the second insn from the end of the BB.
>> -  if (cfun->can_throw_non_call_exceptions
>> -      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
>> +  if (trailing_add)
>> +    changes.safe_push (make_delete (trailing_add));
>> +  else if ((writeback & 2) && !writeback_effect)
>>      {
>> -      gcc_assert (range.includes (insns[1]));
>> -      range = insn_range_info (insns[1]);
>> +      // The second insn initially had writeback but now the pair does not,
>> +      // so we need to update any nondebug uses of the base register def in
>> +      // the second insn.  We'll take care of debug uses later.
>> +      auto def = find_access (insns[1]->defs (), base_regno);
>> +      gcc_assert (def);
>> +      auto set = dyn_cast<set_info *> (def);
>> +      if (set && set->has_nondebug_uses ())
>> +	{
>> +	  auto orig_use = find_access (insns[0]->uses (), base_regno);
>> +	  for (auto use : set->nondebug_insn_uses ())
>> +	    {
>> +	      auto change = make_change (use->insn ());
>> +	      change->new_uses = check_remove_regno_access (attempt,
>> +							    change->new_uses,
>> +							    base_regno);
>> +	      change->new_uses = insert_access (attempt,
>> +						orig_use,
>> +						change->new_uses);
>> +	      changes.safe_push (change);
>> +	    }
>> +	}
>>      }
>>  
>> -  // Placement strategy: push loads down and pull stores up, this should
>> -  // help register pressure by reducing live ranges.
>> -  if (load_p)
>> -    range.first = range.last;
>> -  else
>> -    range.last = range.first;
>> +  auto is_changing = insn_is_changing (changes);
>> +  for (unsigned i = 0; i < changes.length (); i++)
>> +    gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing));
>>  
>> -  if (dump_file)
>> +  // Check the pair pattern is recog'd.
>> +  if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing))
>>      {
>> -      auto print_hazard = [](insn_info *i)
>> -	{
>> -	  if (i)
>> -	    fprintf (dump_file, "%d", i->uid ());
>> -	  else
>> -	    fprintf (dump_file, "-");
>> -	};
>> -      auto print_pair = [print_hazard](insn_info **i)
>> -	{
>> -	  print_hazard (i[0]);
>> -	  fprintf (dump_file, ",");
>> -	  print_hazard (i[1]);
>> -	};
>> +      if (dump_file)
>> +	fprintf (dump_file, "  failed to form pair, recog failed\n");
>>  
>> -      fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
>> -	      load_p, insns[0]->uid (), insns[1]->uid (),
>> -	      base->def->regno ());
>> -      print_pair (base->hazards);
>> -      fprintf (dump_file, "), move_range: (%d,%d)\n",
>> -	       range.first->uid (), range.last->uid ());
>> +      // Free any reg notes we allocated.
>> +      while (reg_notes)
>> +	{
>> +	  rtx next = XEXP (reg_notes, 1);
>> +	  free_EXPR_LIST_node (reg_notes);
>> +	  reg_notes = next;
>> +	}
>> +      cancel_changes (0);
>> +      return false;
>>      }
>>  
>> -  return fuse_pair (load_p, access_size, writeback,
>> -		    i1, i2, *base, range);
>> +  gcc_assert (crtl->ssa->verify_insn_changes (changes));
>> +
>> +  // Fix up any debug uses that will be affected by the changes.
>> +  if (MAY_HAVE_DEBUG_INSNS)
>> +    fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add,
>> +		      load_p, writeback, writeback_effect, base_regno);
>> +
>> +  confirm_change_group ();
>> +  crtl->ssa->change_insns (changes);
>> +
>> +  gcc_checking_assert (tombstone_uids.length () <= 2);
>> +  for (auto uid : tombstone_uids)
>> +    track_tombstone (uid);
>> +
>> +  return true;
>>  }
>>  
>> -static void
>> -dump_insn_list (FILE *f, const insn_list_t &l)
>> +struct aarch64_pair_fusion : public pair_fusion
>>  {
>> -  fprintf (f, "(");
>> +public:
>> +  aarch64_pair_fusion (bb_info *bb) : pair_fusion (bb) {}
>> +  bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode, bool load_p)
>> +  {
>> +    const bool fpsimd_op_p
>> +      = reload_completed
>> +      ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
>> +      : (GET_MODE_CLASS (mem_mode) != MODE_INT
>> +	 && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
>> +    return fpsimd_op_p;
>> +  }
>>  
>> -  auto i = l.begin ();
>> -  auto end = l.end ();
>> +  bool pair_mem_ok_policy (rtx first_mem, bool load_p, machine_mode mode)
>> +  {
>> +    return !aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
>> +						     load_p,
>> +						     mode);
>> +  }
>> +  bool pair_operand_mode_ok_p (machine_mode mode);
>>  
>> -  if (i != end)
>> -    fprintf (f, "%d", (*i)->uid ());
>> -  i++;
>> +  void transform_for_base (int encoded_lfs,
>> +			   access_group &group);
>> +  rtx gen_load_store_pair (rtx *pats,
>> +			   rtx writeback,
>> +			   bool load_p)
>> +  {
>> +    rtx pair_pat;
>>  
>> -  for (; i != end; i++)
>> -    fprintf (f, ", %d", (*i)->uid ());
>> +    if (writeback)
>> +      {
>> +	auto patvec = gen_rtvec (3, writeback, pats[0], pats[1]);
>> +	pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
>> +      }
>> +    else if (load_p)
>> +      pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
>> +					XEXP (pats[1], 0),
>> +					XEXP (pats[0], 1));
>> +    else
>> +      pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
>> +					 XEXP (pats[0], 1),
>> +					 XEXP (pats[1], 1));
>> +    return pair_pat;
>> +  }
>>  
>> -  fprintf (f, ")");
>> -}
>> +  void set_multiword_subreg (insn_info *, insn_info *, bool)
>> +  {
>> +    // Nothing to do for aarch64.
>> +  }
>> +  bool pair_trailing_writeback_p ()
>> +  {
>> +    return aarch64_ldp_writeback > 1;
>> +  }
>> +  bool pair_check_register_operand (bool load_p, rtx reg_op, machine_mode mem_mode)
>> +  {
>> +    return (load_p
>> +            ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
>> +            : !aarch64_stp_reg_operand (reg_op, mem_mode));
>> +  }
>> +  int pair_mem_alias_check_limit ()
>> +  {
>> +    return aarch64_ldp_alias_check_limit;
>> +  }
>> +  bool fuseable_store_p (insn_info *i1, insn_info *i2) { return i1 || i2; }
>> +  bool fuseable_load_p (insn_info *insn) { return insn; }
>> +  bool pair_is_writeback ()
>> +  {
>> +    return !aarch64_ldp_writeback;
>> +  }
>> +private:
>> +  int num_pairs;
>> +  rtx_insn *reg_ops[3];
>> +};
>>  
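
For readers skimming the diff: the class above is only reached through the
pair_fusion base class (the ldp_fusion_bb hunk further down drives the pass
through a pair_fusion pointer), so the shape being introduced is "generic
driver calls virtual target hooks".  A minimal, self-contained sketch of that
shape follows; every name in it is illustrative, and the made-up size check
merely stands in for the real mode/tuning hooks.

  #include <cstdio>

  struct pair_fusion_sketch
  {
    virtual ~pair_fusion_sketch () = default;

    // Target hook: can an operand of SIZE bytes go in a paired access?
    virtual bool operand_size_ok_p (unsigned size) const = 0;

    // Generic driver: two accesses may pair only if their sizes match
    // and the target accepts that size.
    bool can_pair_p (unsigned size1, unsigned size2) const
    {
      return size1 == size2 && operand_size_ok_p (size1);
    }
  };

  struct aarch64_like_target : pair_fusion_sketch
  {
    // Stand-in for the real checks: allow 4-, 8- and 16-byte operands.
    bool operand_size_ok_p (unsigned size) const override
    {
      return size == 4 || size == 8 || size == 16;
    }
  };

  int
  main ()
  {
    aarch64_like_target target;
    pair_fusion_sketch *state = &target;  // driven through the base class
    std::printf ("8/8: %d  8/4: %d  2/2: %d\n",
                 state->can_pair_p (8, 8),
                 state->can_pair_p (8, 4),
                 state->can_pair_p (2, 2));
    return 0;
  }

The real hooks of course take rtx/machine_mode arguments and consult the
aarch64 tuning machinery; only the dispatch structure carries over.
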
>> -DEBUG_FUNCTION void
>> -debug (const insn_list_t &l)
>> +static lfs_fields
>> +decode_lfs (int lfs)
>>  {
>> -  dump_insn_list (stderr, l);
>> -  fprintf (stderr, "\n");
>> +  bool load_p = (lfs & (1 << 3));
>> +  bool fpsimd_p = (lfs & (1 << 2));
>> +  unsigned size = 1U << ((lfs & 3) + 2);
>> +  return { load_p, fpsimd_p, size };
>>  }
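
decode_lfs above unpacks a small integer key into (load_p, fpsimd_p, size):
bit 3 is load_p, bit 2 is fpsimd_p, and the low two bits hold log2(size) - 2.
The encoding side is not visible in this hunk, so in the round trip below the
encoder is an assumption written to mirror the decoder; the _sketch names mark
this as illustration rather than the patch's code.

  #include <cassert>

  struct lfs_fields_sketch { bool load_p; bool fpsimd_p; unsigned size; };

  // Assumed inverse of the decoder, restricted to 4/8/16-byte operands.
  static int
  encode_lfs_sketch (lfs_fields_sketch f)
  {
    int size_log2 = (f.size == 4) ? 2 : (f.size == 8) ? 3 : 4;
    return ((int) f.load_p << 3) | ((int) f.fpsimd_p << 2) | (size_log2 - 2);
  }

  // Same bit layout as decode_lfs in the patch.
  static lfs_fields_sketch
  decode_lfs_sketch (int lfs)
  {
    bool load_p = (lfs & (1 << 3));
    bool fpsimd_p = (lfs & (1 << 2));
    unsigned size = 1U << ((lfs & 3) + 2);
    return { load_p, fpsimd_p, size };
  }

  int
  main ()
  {
    lfs_fields_sketch f = { true, false, 16 };
    lfs_fields_sketch g = decode_lfs_sketch (encode_lfs_sketch (f));
    assert (g.load_p == f.load_p && g.fpsimd_p == f.fpsimd_p
            && g.size == f.size);
    return 0;
  }
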
>>  
>> -// LEFT_LIST and RIGHT_LIST are lists of candidate instructions where all insns
>> -// in LEFT_LIST are known to be adjacent to those in RIGHT_LIST.
>> -//
>> -// This function traverses the resulting 2D matrix of possible pair candidates
>> -// and attempts to merge them into pairs.
>> -//
>> -// The algorithm is straightforward: if we consider a combined list of
>> -// candidates X obtained by merging LEFT_LIST and RIGHT_LIST in program order,
>> -// then we advance through X until we reach a crossing point (where X[i] and
>> -// X[i+1] come from different source lists).
>> -//
>> -// At this point we know X[i] and X[i+1] are adjacent accesses, and we try to
>> -// fuse them into a pair.  If this succeeds, we remove X[i] and X[i+1] from
>> -// their original lists and continue as above.
>> -//
>> -// In the failure case, we advance through the source list containing X[i] and
>> -// continue as above (proceeding to the next crossing point).
>> -//
>> -// The rationale for skipping over groups of consecutive candidates from the
>> -// same source list is as follows:
>> -//
>> -// In the store case, the insns in the group can't be re-ordered over each
>> -// other as they are guaranteed to store to the same location, so we're
>> -// guaranteed not to lose opportunities by doing this.
>> -//
>> -// In the load case, subsequent loads from the same location are either
>> -// redundant (in which case they should have been cleaned up by an earlier
>> -// optimization pass) or there is an intervening aliasing hazard, in which case
>> -// we can't re-order them anyway, so provided earlier passes have cleaned up
>> -// redundant loads, we shouldn't miss opportunities by doing this.
>> -void
>> -ldp_bb_info::merge_pairs (insn_list_t &left_list,
>> -			  insn_list_t &right_list,
>> -			  bool load_p,
>> -			  unsigned access_size)
>> +// Return true if we should consider forming ldp/stp insns from memory
>> +// accesses with operand mode MODE at this stage in compilation.
>> +static bool
>> +ldp_operand_mode_ok_p (machine_mode mode)
>>  {
>> -  if (dump_file)
>> -    {
>> -      fprintf (dump_file, "merge_pairs [L=%d], cand vecs ", load_p);
>> -      dump_insn_list (dump_file, left_list);
>> -      fprintf (dump_file, " x ");
>> -      dump_insn_list (dump_file, right_list);
>> -      fprintf (dump_file, "\n");
>> -    }
>> +  const bool allow_qregs
>> +    = !(aarch64_tune_params.extra_tuning_flags
>> +	& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
>>  
>> -  auto iter_l = left_list.begin ();
>> -  auto iter_r = right_list.begin ();
>> +  if (!aarch64_ldpstp_operand_mode_p (mode))
>> +    return false;
>>  
>> -  while (iter_l != left_list.end () && iter_r != right_list.end ())
>> +  const auto size = GET_MODE_SIZE (mode).to_constant ();
>> +  if (size == 16 && !allow_qregs)
>> +    return false;
>> +
>> +  // We don't pair up TImode accesses before RA because TImode is
>> +  // special in that it can be allocated to a pair of GPRs or a single
>> +  // FPR, and the RA is best placed to make that decision.
>> +  return reload_completed || mode != TImode;
>> +}
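
Spelled out with the target queries replaced by plain parameters (purely an
assumption for illustration; the real code asks aarch64_ldpstp_operand_mode_p
and the tuning flags), the gating above amounts to:

  // Illustrative-only restatement of the gating in ldp_operand_mode_ok_p.
  bool
  pairable_operand_sketch (bool mode_ok, unsigned size_bytes, bool allow_qregs,
                           bool after_reload, bool is_timode)
  {
    if (!mode_ok)
      return false;
    if (size_bytes == 16 && !allow_qregs)  // Q-register pairs can be disabled
      return false;
    // Before RA, leave TImode alone: it may end up in GPRs or an FPR.
    return after_reload || !is_timode;
  }
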
>> +
>> +bool
>> +aarch64_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
>> +{
>> +  return ldp_operand_mode_ok_p (mode);
>> +}
>> +
>> +// Given a pair mode MODE, return a canonical mode to be used for a single
>> +// operand of such a pair.  Currently we only use this when promoting a
>> +// non-writeback pair into a writeback pair, as it isn't otherwise clear
>> +// which mode to use when storing a modeless CONST_INT.
>> +static machine_mode
>> +aarch64_operand_mode_for_pair_mode (machine_mode mode)
>> +{
>> +  switch (mode)
>>      {
>> -      auto next_l = std::next (iter_l);
>> -      auto next_r = std::next (iter_r);
>> -      if (**iter_l < **iter_r
>> -	  && next_l != left_list.end ()
>> -	  && **next_l < **iter_r)
>> -	iter_l = next_l;
>> -      else if (**iter_r < **iter_l
>> -	       && next_r != right_list.end ()
>> -	       && **next_r < **iter_l)
>> -	iter_r = next_r;
>> -      else if (try_fuse_pair (load_p, access_size, *iter_l, *iter_r))
>> -	{
>> -	  left_list.erase (iter_l);
>> -	  iter_l = next_l;
>> -	  right_list.erase (iter_r);
>> -	  iter_r = next_r;
>> -	}
>> -      else if (**iter_l < **iter_r)
>> -	iter_l = next_l;
>> -      else
>> -	iter_r = next_r;
>> +    case E_V2x4QImode:
>> +      return SImode;
>> +    case E_V2x8QImode:
>> +      return DImode;
>> +    case E_V2x16QImode:
>> +      return V16QImode;
>> +    default:
>> +      gcc_unreachable ();
>>      }
>>  }
>>  
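
The comment block removed above describes merge_pairs' crossing-point walk
over two sorted candidate lists.  A compact model of just that control flow is
below: plain ints stand in for insns, and a toy adjacency test stands in for
the real fusion attempt.

  #include <cstdio>
  #include <functional>
  #include <iterator>
  #include <list>

  static void
  merge_pairs_sketch (std::list<int> &left, std::list<int> &right,
                      const std::function<bool (int, int)> &try_fuse)
  {
    auto it_l = left.begin ();
    auto it_r = right.begin ();
    while (it_l != left.end () && it_r != right.end ())
      {
        auto next_l = std::next (it_l);
        auto next_r = std::next (it_r);
        if (*it_l < *it_r && next_l != left.end () && *next_l < *it_r)
          it_l = next_l;          // skip a run of left-only candidates
        else if (*it_r < *it_l && next_r != right.end () && *next_r < *it_l)
          it_r = next_r;          // skip a run of right-only candidates
        else if (try_fuse (*it_l, *it_r))
          {
            // Fused: drop both candidates and carry on past them.
            left.erase (it_l);
            it_l = next_l;
            right.erase (it_r);
            it_r = next_r;
          }
        else if (*it_l < *it_r)
          it_l = next_l;
        else
          it_r = next_r;
      }
  }

  int
  main ()
  {
    std::list<int> left = { 1, 4, 9 };
    std::list<int> right = { 2, 6, 10 };
    merge_pairs_sketch (left, right, [] (int a, int b)
      {
        bool ok = (b - a) < 3 && (a - b) < 3;   // toy adjacency test
        std::printf ("try (%d,%d): %s\n", a, b, ok ? "fused" : "rejected");
        return ok;
      });
    return 0;
  }
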
>> @@ -2890,8 +3108,8 @@ ldp_bb_info::merge_pairs (insn_list_t &left_list,
>>  // of accesses.  If we find two sets of adjacent accesses, call
>>  // merge_pairs.
>>  void
>> -ldp_bb_info::transform_for_base (int encoded_lfs,
>> -				 access_group &group)
>> +aarch64_pair_fusion::transform_for_base (int encoded_lfs,
>> +					 access_group &group)
>>  {
>>    const auto lfs = decode_lfs (encoded_lfs);
>>    const unsigned access_size = lfs.size;
>> @@ -2915,55 +3133,6 @@ ldp_bb_info::transform_for_base (int encoded_lfs,
>>      }
>>  }
>>  
>> -// If we emitted tombstone insns for this BB, iterate through the BB
>> -// and remove all the tombstone insns, being sure to reparent any uses
>> -// of mem to previous defs when we do this.
>> -void
>> -ldp_bb_info::cleanup_tombstones ()
>> -{
>> -  // No need to do anything if we didn't emit a tombstone insn for this BB.
>> -  if (!m_emitted_tombstone)
>> -    return;
>> -
>> -  for (auto insn : iterate_safely (m_bb->nondebug_insns ()))
>> -    {
>> -      if (!insn->is_real ()
>> -	  || !bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()))
>> -	continue;
>> -
>> -      auto set = as_a<set_info *> (memory_access (insn->defs ()));
>> -      if (set->has_any_uses ())
>> -	{
>> -	  auto prev_set = as_a<set_info *> (set->prev_def ());
>> -	  while (set->first_use ())
>> -	    crtl->ssa->reparent_use (set->first_use (), prev_set);
>> -	}
>> -
>> -      // Now set has no uses, we can delete it.
>> -      insn_change change (insn, insn_change::DELETE);
>> -      crtl->ssa->change_insn (change);
>> -    }
>> -}
>> -
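
The removed cleanup_tombstones above boils down to: for each tombstone,
reparent every use of its memory def onto the previous def, then delete the
insn.  That ordering can be modelled without any rtl-ssa machinery; the toy
below uses invented types, assumes the first def is never a tombstone, and is
only meant to show the order of operations.

  #include <cassert>
  #include <vector>

  struct def_sketch
  {
    int id;
    bool tombstone;
    def_sketch *prev;             // previous def of the same resource
    std::vector<int> use_ids;     // uses currently consuming this def
  };

  static void
  cleanup_tombstones_sketch (std::vector<def_sketch *> &defs)
  {
    for (auto it = defs.begin (); it != defs.end ();)
      {
        def_sketch *d = *it;
        if (!d->tombstone)
          {
            ++it;
            continue;
          }
        // Reparent all uses to the previous def, then drop the tombstone.
        for (int use : d->use_ids)
          d->prev->use_ids.push_back (use);
        d->use_ids.clear ();
        it = defs.erase (it);
      }
  }

  int
  main ()
  {
    def_sketch d0 = { 0, false, nullptr, { 10 } };
    def_sketch d1 = { 1, true, &d0, { 11, 12 } };  // tombstone to be removed
    std::vector<def_sketch *> defs = { &d0, &d1 };
    cleanup_tombstones_sketch (defs);
    assert (defs.size () == 1 && d0.use_ids.size () == 3);
    return 0;
  }
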
>> -template<typename Map>
>> -void
>> -ldp_bb_info::traverse_base_map (Map &map)
>> -{
>> -  for (auto kv : map)
>> -    {
>> -      const auto &key = kv.first;
>> -      auto &value = kv.second;
>> -      transform_for_base (key.second, value);
>> -    }
>> -}
>> -
>> -void
>> -ldp_bb_info::transform ()
>> -{
>> -  traverse_base_map (expr_map);
>> -  traverse_base_map (def_map);
>> -}
>> -
>>  static void
>>  ldp_fusion_init ()
>>  {
>> @@ -3174,7 +3343,9 @@ void ldp_fusion_bb (bb_info *bb)
>>    const bool track_stores
>>      = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
>>  
>> -  ldp_bb_info bb_state (bb);
>> +  pair_fusion *bb_state;
>> +  aarch64_pair_fusion derived (bb);
>> +  bb_state = &derived;
>>  
>>    for (auto insn : bb->nondebug_insns ())
>>      {
>> @@ -3194,13 +3365,13 @@ void ldp_fusion_bb (bb_info *bb)
>>  	continue;
>>  
>>        if (track_stores && MEM_P (XEXP (pat, 0)))
>> -	bb_state.track_access (insn, false, XEXP (pat, 0));
>> +	bb_state->track_access (insn, false, XEXP (pat, 0));
>>        else if (track_loads && MEM_P (XEXP (pat, 1)))
>> -	bb_state.track_access (insn, true, XEXP (pat, 1));
>> +	bb_state->track_access (insn, true, XEXP (pat, 1));
>>      }
>>  
>> -  bb_state.transform ();
>> -  bb_state.cleanup_tombstones ();
>> +  bb_state->transform ();
>> +  bb_state->cleanup_tombstones ();
>>  }
>>  
>>  void ldp_fusion ()
>> @@ -3263,7 +3434,7 @@ public:
>>      }
>>  };
>>  
>> -} // anon namespace
>> +}// anon namespace
>>  
>>  rtl_opt_pass *
>>  make_pass_ldp_fusion (gcc::context *ctx)


Thread overview: 4+ messages
2024-02-15 18:43 [PATCH 0/2 V2] aarch64: Place target independent and dependent code in one file Ajit Agarwal
2024-02-22 19:49 ` Richard Sandiford
2024-02-22 21:17   ` Segher Boessenkool
2024-02-23 11:25   ` Ajit Agarwal
