[PATCH v2 10/18]middle-end simplify lane permutes which selects from loads from the same DR.

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

From: Tamar Christina <tamar.christina@arm.com>
To: gcc-patches@gcc.gnu.org
Cc: nd@arm.com, rguenther@suse.de, ook@ucw.cz
Subject: [PATCH v2 10/18]middle-end simplify lane permutes which selects from loads from the same DR.
Date: Tue, 3 Nov 2020 15:07:51 +0000	[thread overview]
Message-ID: <patch-13720-tamar@arm.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 1596 bytes --]

Hi All,

This change allows one to simplify lane permutes that select from multiple load
leafs that load from the same DR group by promoting the VEC_PERM node into a
load itself and pushing the lane permute into it as a load permute.

This saves us from having to calculate where to materialize a new load node.
If the resulting loads are now unused they are freed and are removed from the
graph.

This allows us to handle cases where we would have generated:

	movi    v4.4s, 0
	adrp    x3, .LC0
	ldr     q5, [x3, #:lo12:.LC0]
	mov     x3, 0
	.p2align 3,,7
.L2:
	mov     v0.16b, v4.16b
	mov     v3.16b, v4.16b
	ldr     q1, [x1, x3]
	ldr     q2, [x0, x3]
	fcmla   v0.4s, v2.4s, v1.4s, #0
	fcmla   v3.4s, v1.4s, v2.4s, #0
	fcmla   v0.4s, v2.4s, v1.4s, #270
	fcmla   v3.4s, v1.4s, v2.4s, #270
	mov     v1.16b, v3.16b
	tbl     v0.16b, {v0.16b - v1.16b}, v5.16b
	str     q0, [x2, x3]
	add     x3, x3, 16
	cmp     x3, 1600
	bne     .L2
	ret

and instead generate

	mov     x3, 0
	.p2align 3,,7
.L27:
	ldr     q0, [x2, x3]
	ldr     q1, [x0, x3]
	ldr     q2, [x1, x3]
	fcmla   v0.2d, v1.2d, v2.2d, #0
	fcmla   v0.2d, v1.2d, v2.2d, #270
	str     q0, [x2, x3]
	add     x3, x3, 16
	cmp     x3, 512
	bne     .L27
	ret

This runs as a pre step such that permute simplification can still inspect this
permute is needed

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Tests are included as part of the final patch as they need the SLP pattern
matcher to insert permutes in between.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-slp.c (vect_optimize_slp): Promote permutes.

-- 

[-- Attachment #2: rb13720.patch --]
[-- Type: text/x-diff, Size: 5287 bytes --]

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 48f615e1952707de4827f0e69e443c0a7db27d81..a3881b59b3c2aafb216ef320633255ed91f4dd45 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2937,6 +2937,7 @@ vect_optimize_slp (vec_info *vinfo)
   unsigned i;
   auto_vec<slp_tree> vertices;
   auto_vec<int> leafs;
+  hash_set<int_hash<int, -1, -1> > dup_leafs;
   vect_slp_build_vertices (vinfo, vertices, leafs);
 
   struct graph *slpg = new_graph (vertices.length ());
@@ -2954,12 +2955,14 @@ vect_optimize_slp (vec_info *vinfo)
   graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
 
   auto_sbitmap n_visited (vertices.length ());
+  auto_sbitmap n_replicated (vertices.length ());
   auto_sbitmap n_materialize (vertices.length ());
   auto_vec<int> n_perm (vertices.length ());
   auto_vec<vec<unsigned> > perms;
 
   bitmap_clear (n_visited);
   bitmap_clear (n_materialize);
+  bitmap_clear (n_replicated);
   n_perm.quick_grow_cleared (vertices.length ());
   perms.safe_push (vNULL); /* zero is no permute */
 
@@ -3000,6 +3003,11 @@ vect_optimize_slp (vec_info *vinfo)
       /* If there's no permute no need to split one out.  */
       if (!any_permute)
 	continue;
+
+      /* If the operation is a replication mark it for further inspection.  */
+      if (imin == imax)
+	dup_leafs.add (idx);
+
       /* If the span doesn't match we'd disrupt VF computation, avoid
 	 that for now.  */
       if (imax - imin + 1 != SLP_TREE_LANES (node))
@@ -3028,6 +3036,100 @@ vect_optimize_slp (vec_info *vinfo)
       n_perm[idx] = perms.length () - 1;
     }
 
+  /* Inspect all replication node and determine if they are connected to a
+     permute operation that may linearize the load.  */
+  for (hash_set<int_hash<int, -1, -1> >::iterator iter = dup_leafs.begin ();
+       iter != dup_leafs.end (); ++iter)
+    {
+      int idx = *iter;
+
+      graph_edge *edge = slpg->vertices[idx].pred;
+      do
+	{
+	  slp_tree node = vertices[idx];
+	  unsigned src = edge->src;
+	  /* If we've visited the permute node already leave it alone.  This
+	     prevents us from re-inspecting it for every leafs that lead to it.  */
+	  if (bitmap_bit_p (n_replicated, src))
+	    continue;
+
+	  slp_tree parent = vertices[src];
+	  bitmap_set_bit (n_replicated, src);
+
+	  if (!SLP_TREE_LANE_PERMUTATION (parent).exists ())
+	    continue;
+
+	  /* Check if all edges lead to a load and that all the loads are
+	     coming from the same group.  */
+	  unsigned j;
+	  bool distribute_p = SLP_TREE_CHILDREN (parent).length () > 0;
+	  stmt_vec_info rep_stmt = SLP_TREE_REPRESENTATIVE (node);
+	  stmt_vec_info dr_stmt = DR_GROUP_FIRST_ELEMENT (rep_stmt);
+	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (parent), j, node)
+	    {
+	      /* Check if this is one of the nodes we know have a replication
+		 operation.  */
+	      distribute_p = dup_leafs.contains (node->vertex);
+	      if (!distribute_p)
+		break;
+	      stmt_vec_info cur_dr_stmt = SLP_TREE_REPRESENTATIVE (node);
+	      cur_dr_stmt = DR_GROUP_FIRST_ELEMENT (cur_dr_stmt);
+	      distribute_p = dr_stmt == cur_dr_stmt;
+	      if (!distribute_p)
+		break;
+	    }
+
+	  /* If we have a valid node to optimize, do it and disconnect it from
+	     the graph.  */
+	  if (distribute_p)
+	  {
+	    if (dump_enabled_p ())
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "converting permute node %p into load\n",
+			       parent);
+
+	    /* First convert this node into a load node and add it to the leaves
+	       list and flatten the permute from a lane to a load one.  If it's
+	       unneeded it will be elided later.
+
+	       Question: I presume I need to add this new node to the instance's
+	       loads list? *But I have no idea which instance I'm in.  */
+	    vec<stmt_vec_info> stmts;
+	    stmts.create (SLP_TREE_LANES (parent));
+	    load_permutation_t load_perm;
+	    load_perm.create (SLP_TREE_LANES (parent));
+	    lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (parent);
+	    for (j = 0; j < lane_perm.length (); j++)
+	      {
+		std::pair<unsigned, unsigned> perm = lane_perm[j];
+		node = SLP_TREE_CHILDREN (parent)[perm.first];
+		stmts.safe_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+		load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]);
+	      }
+	    SLP_TREE_REPRESENTATIVE (parent) = rep_stmt;
+	    SLP_TREE_SCALAR_STMTS (parent) = stmts;
+	    SLP_TREE_LANE_PERMUTATION (parent).release();
+	    SLP_TREE_LANE_PERMUTATION (parent) = vNULL;
+	    SLP_TREE_LOAD_PERMUTATION (parent) = load_perm;
+	    SLP_TREE_CODE (parent) = ERROR_MARK;
+
+	    /* One last iteration to free the nodes.  */
+	    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (parent), j, node)
+	      {
+		/* If we are the only reference to the node, remove the vertex.
+		   We don't have to modify the graph since vertices lead the
+		   graph traversal.  */
+		vect_free_slp_tree (node);
+	      }
+
+	    SLP_TREE_CHILDREN (parent) = vNULL;
+
+	    /* And finally it to the leafs set and updated the vertices.  */
+	    leafs.safe_push (parent->vertex);
+	  }
+	} while ((edge = edge->pred_next));
+    }
+
   /* Propagate permutes along the graph and compute materialization points.  */
   bool changed;
   unsigned iteration = 0;

next             reply	other threads:[~2020-11-03 15:08 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-03 15:07 Tamar Christina [this message]
2020-11-04 13:35 ` Richard Biener
2020-11-04 14:02   ` Tamar Christina
2020-11-04 15:12     ` Richard Biener
2020-11-04 15:17       ` Tamar Christina

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=patch-13720-tamar@arm.com \
    --to=tamar.christina@arm.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=nd@arm.com \
    --cc=ook@ucw.cz \
    --cc=rguenther@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).