public inbox for gcc-patches@gcc.gnu.org
* [PATCH v2 10/18]middle-end simplify lane permutes which selects from loads from the same DR.
@ 2020-11-03 15:07 Tamar Christina
  2020-11-04 13:35 ` Richard Biener
  0 siblings, 1 reply; 5+ messages in thread
From: Tamar Christina @ 2020-11-03 15:07 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, ook

[-- Attachment #1: Type: text/plain, Size: 1596 bytes --]

Hi All,

This change simplifies lane permutes that select from multiple load leaves
loading from the same DR group, by promoting the VEC_PERM node into a load
itself and pushing the lane permute into it as a load permute.

This saves us from having to calculate where to materialize a new load node.
If the resulting loads are now unused they are freed and removed from the
graph.
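
For illustration, the kind of scalar loop this targets is a complex
multiply-accumulate.  The exact testcases only land with the final patch of
the series, so the following is an assumed shape rather than the real test:

  /* Complex MLA over _Complex float arrays; with the SLP pattern matcher
     this is recognized and mapped to FCMLA on AArch64 (the rotations used
     depend on whether an operand is conjugated).  */
  void
  fma_complex (_Complex float *restrict c, _Complex float *restrict a,
               _Complex float *restrict b, int n)
  {
    for (int i = 0; i < n; i++)
      c[i] += a[i] * b[i];
  }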

This allows us to handle cases where we would have generated:

	movi    v4.4s, 0
	adrp    x3, .LC0
	ldr     q5, [x3, #:lo12:.LC0]
	mov     x3, 0
	.p2align 3,,7
.L2:
	mov     v0.16b, v4.16b
	mov     v3.16b, v4.16b
	ldr     q1, [x1, x3]
	ldr     q2, [x0, x3]
	fcmla   v0.4s, v2.4s, v1.4s, #0
	fcmla   v3.4s, v1.4s, v2.4s, #0
	fcmla   v0.4s, v2.4s, v1.4s, #270
	fcmla   v3.4s, v1.4s, v2.4s, #270
	mov     v1.16b, v3.16b
	tbl     v0.16b, {v0.16b - v1.16b}, v5.16b
	str     q0, [x2, x3]
	add     x3, x3, 16
	cmp     x3, 1600
	bne     .L2
	ret

and instead generate

	mov     x3, 0
	.p2align 3,,7
.L27:
	ldr     q0, [x2, x3]
	ldr     q1, [x0, x3]
	ldr     q2, [x1, x3]
	fcmla   v0.2d, v1.2d, v2.2d, #0
	fcmla   v0.2d, v1.2d, v2.2d, #270
	str     q0, [x2, x3]
	add     x3, x3, 16
	cmp     x3, 512
	bne     .L27
	ret

This runs as a pre-step such that permute simplification can still inspect
whether this permute is needed.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Tests are included as part of the final patch as they need the SLP pattern
matcher to insert permutes in between.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-slp.c (vect_optimize_slp): Promote permutes.

-- 

[-- Attachment #2: rb13720.patch --]
[-- Type: text/x-diff, Size: 5287 bytes --]

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 48f615e1952707de4827f0e69e443c0a7db27d81..a3881b59b3c2aafb216ef320633255ed91f4dd45 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2937,6 +2937,7 @@ vect_optimize_slp (vec_info *vinfo)
   unsigned i;
   auto_vec<slp_tree> vertices;
   auto_vec<int> leafs;
+  hash_set<int_hash<int, -1, -1> > dup_leafs;
   vect_slp_build_vertices (vinfo, vertices, leafs);
 
   struct graph *slpg = new_graph (vertices.length ());
@@ -2954,12 +2955,14 @@ vect_optimize_slp (vec_info *vinfo)
   graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
 
   auto_sbitmap n_visited (vertices.length ());
+  auto_sbitmap n_replicated (vertices.length ());
   auto_sbitmap n_materialize (vertices.length ());
   auto_vec<int> n_perm (vertices.length ());
   auto_vec<vec<unsigned> > perms;
 
   bitmap_clear (n_visited);
   bitmap_clear (n_materialize);
+  bitmap_clear (n_replicated);
   n_perm.quick_grow_cleared (vertices.length ());
   perms.safe_push (vNULL); /* zero is no permute */
 
@@ -3000,6 +3003,11 @@ vect_optimize_slp (vec_info *vinfo)
       /* If there's no permute no need to split one out.  */
       if (!any_permute)
 	continue;
+
+      /* If the operation is a replication mark it for further inspection.  */
+      if (imin == imax)
+	dup_leafs.add (idx);
+
       /* If the span doesn't match we'd disrupt VF computation, avoid
 	 that for now.  */
       if (imax - imin + 1 != SLP_TREE_LANES (node))
@@ -3028,6 +3036,100 @@ vect_optimize_slp (vec_info *vinfo)
       n_perm[idx] = perms.length () - 1;
     }
 
+  /* Inspect all replication nodes and determine whether they are connected
+     to a permute operation that may linearize the load.  */
+  for (hash_set<int_hash<int, -1, -1> >::iterator iter = dup_leafs.begin ();
+       iter != dup_leafs.end (); ++iter)
+    {
+      int idx = *iter;
+
+      graph_edge *edge = slpg->vertices[idx].pred;
+      do
+	{
+	  slp_tree node = vertices[idx];
+	  unsigned src = edge->src;
+	  /* If we've visited the permute node already, leave it alone.  This
+	     prevents us from re-inspecting it for every leaf that leads to it.  */
+	  if (bitmap_bit_p (n_replicated, src))
+	    continue;
+
+	  slp_tree parent = vertices[src];
+	  bitmap_set_bit (n_replicated, src);
+
+	  if (!SLP_TREE_LANE_PERMUTATION (parent).exists ())
+	    continue;
+
+	  /* Check if all edges lead to a load and that all the loads are
+	     coming from the same group.  */
+	  unsigned j;
+	  bool distribute_p = SLP_TREE_CHILDREN (parent).length () > 0;
+	  stmt_vec_info rep_stmt = SLP_TREE_REPRESENTATIVE (node);
+	  stmt_vec_info dr_stmt = DR_GROUP_FIRST_ELEMENT (rep_stmt);
+	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (parent), j, node)
+	    {
+	      /* Check if this is one of the nodes we know have a replication
+		 operation.  */
+	      distribute_p = dup_leafs.contains (node->vertex);
+	      if (!distribute_p)
+		break;
+	      stmt_vec_info cur_dr_stmt = SLP_TREE_REPRESENTATIVE (node);
+	      cur_dr_stmt = DR_GROUP_FIRST_ELEMENT (cur_dr_stmt);
+	      distribute_p = dr_stmt == cur_dr_stmt;
+	      if (!distribute_p)
+		break;
+	    }
+
+	  /* If we have a valid node to optimize, do it and disconnect it from
+	     the graph.  */
+	  if (distribute_p)
+	  {
+	    if (dump_enabled_p ())
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "converting permute node %p into load\n",
+			       parent);
+
+	    /* First convert this node into a load node and add it to the leaves
+	       list and flatten the permute from a lane to a load one.  If it's
+	       unneeded it will be elided later.
+
+	       Question: I presume I need to add this new node to the instance's
+	       loads list? *But I have no idea which instance I'm in.  */
+	    vec<stmt_vec_info> stmts;
+	    stmts.create (SLP_TREE_LANES (parent));
+	    load_permutation_t load_perm;
+	    load_perm.create (SLP_TREE_LANES (parent));
+	    lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (parent);
+	    for (j = 0; j < lane_perm.length (); j++)
+	      {
+		std::pair<unsigned, unsigned> perm = lane_perm[j];
+		node = SLP_TREE_CHILDREN (parent)[perm.first];
+		stmts.safe_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+		load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]);
+	      }
+	    SLP_TREE_REPRESENTATIVE (parent) = rep_stmt;
+	    SLP_TREE_SCALAR_STMTS (parent) = stmts;
+	    SLP_TREE_LANE_PERMUTATION (parent).release();
+	    SLP_TREE_LANE_PERMUTATION (parent) = vNULL;
+	    SLP_TREE_LOAD_PERMUTATION (parent) = load_perm;
+	    SLP_TREE_CODE (parent) = ERROR_MARK;
+
+	    /* One last iteration to free the nodes.  */
+	    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (parent), j, node)
+	      {
+		/* If we are the only reference to the node, remove the vertex.
+		   We don't have to modify the graph since vertices lead the
+		   graph traversal.  */
+		vect_free_slp_tree (node);
+	      }
+
+	    SLP_TREE_CHILDREN (parent) = vNULL;
+
+	    /* And finally add it to the leafs set and update the vertices.  */
+	    leafs.safe_push (parent->vertex);
+	  }
+	} while ((edge = edge->pred_next));
+    }
+
   /* Propagate permutes along the graph and compute materialization points.  */
   bool changed;
   unsigned iteration = 0;



* Re: [PATCH v2 10/18]middle-end simplify lane permutes which selects from loads from the same DR.
  2020-11-03 15:07 [PATCH v2 10/18]middle-end simplify lane permutes which selects from loads from the same DR Tamar Christina
@ 2020-11-04 13:35 ` Richard Biener
  2020-11-04 14:02   ` Tamar Christina
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Biener @ 2020-11-04 13:35 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, ook

On Tue, 3 Nov 2020, Tamar Christina wrote:

> Hi All,
> 
> This change allows one to simplify lane permutes that select from multiple load
> leafs that load from the same DR group by promoting the VEC_PERM node into a
> load itself and pushing the lane permute into it as a load permute.
> 
> This saves us from having to calculate where to materialize a new load node.
> If the resulting loads are now unused they are freed and are removed from the
> graph.
> 
> This allows us to handle cases where we would have generated:
> 
> 	movi    v4.4s, 0
> 	adrp    x3, .LC0
> 	ldr     q5, [x3, #:lo12:.LC0]
> 	mov     x3, 0
> 	.p2align 3,,7
> .L2:
> 	mov     v0.16b, v4.16b
> 	mov     v3.16b, v4.16b
> 	ldr     q1, [x1, x3]
> 	ldr     q2, [x0, x3]
> 	fcmla   v0.4s, v2.4s, v1.4s, #0
> 	fcmla   v3.4s, v1.4s, v2.4s, #0
> 	fcmla   v0.4s, v2.4s, v1.4s, #270
> 	fcmla   v3.4s, v1.4s, v2.4s, #270
> 	mov     v1.16b, v3.16b
> 	tbl     v0.16b, {v0.16b - v1.16b}, v5.16b
> 	str     q0, [x2, x3]
> 	add     x3, x3, 16
> 	cmp     x3, 1600
> 	bne     .L2
> 	ret
> 
> and instead generate
> 
> 	mov     x3, 0
> 	.p2align 3,,7
> .L27:
> 	ldr     q0, [x2, x3]
> 	ldr     q1, [x0, x3]
> 	ldr     q2, [x1, x3]
> 	fcmla   v0.2d, v1.2d, v2.2d, #0
> 	fcmla   v0.2d, v1.2d, v2.2d, #270
> 	str     q0, [x2, x3]
> 	add     x3, x3, 16
> 	cmp     x3, 512
> 	bne     .L27
> 	ret
> 
> This runs as a pre step such that permute simplification can still inspect this
> permute is needed
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> Tests are included as part of the final patch as they need the SLP pattern
> matcher to insert permutes in between.
> 
> Ok for master?

So I think this is too specialized for the general issue that we're
doing a bad job in CSEing the load part of different permutes of
the same group.  I've played with fixing this half a year ago (again)
in multiple general ways but they all caused some regressions.

So you're now adding some heuristics as to when to anticipate
"CSE" (or merging with followup permutes).

To quickly recap what I did, consider two loads (V2DF),
one { a[0], a[1] } and the other { a[1], a[0] }.  They
are currently two SLP nodes, one of them with a load_permutation.
My original attempts focused on trying to get rid of load_permutation
in favor of lane_permute nodes and thus during SLP discovery
I turned the second into { a[0], a[1] } (magically unified with
the other load) and a followup lane-permute node.

So for your case you have IIUC { a[0], a[0] } and { a[1], a[1] }
which eventually will (due to patterns) be lane-permuted
into { a[0], a[1] }, right?  So generalizing this as
a single { a[0], a[1] } plus two lane-permute nodes  { 0, 0 }
and { 1, 1 } early would solve the issue as well?  Now,
in general it might be more profitable to generate the
{ a[0], a[0] } and { a[1], a[1] } via scalar-load-and-splat
rather than vector load and permute so we have to be careful
to not over-optimize here or be prepared to do the reverse
transform.
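
As a rough GNU C sketch of those two alternatives (generic vector extensions,
V2DF-sized types; purely illustrative, not code from the series):

  typedef double v2df __attribute__ ((vector_size (16)));
  typedef long long v2di __attribute__ ((vector_size (16)));

  /* { a[1], a[1] } built from the unified vector load { a[0], a[1] }
     followed by a lane permute { 1, 1 }.  */
  v2df
  dup_via_permute (const v2df *a)
  {
    return __builtin_shuffle (*a, (v2di) { 1, 1 });
  }

  /* The same value built from a scalar load of a[1] splat into both
     lanes.  */
  v2df
  dup_via_splat (const double *a)
  {
    return (v2df) { a[1], a[1] };
  }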

The patch itself is a bit ugly since it modifies the SLP
graph when we already produced the graphds graph so I
would do any of this before.  I did consider gathering
all load nodes loading from a group and then trying to
apply some heuristic to alter the SLP graph so it can
be better optimized.  In fact when we want to generate
the same code as the non-SLP interleaving scheme does
we do have to look at those since we have to unify
loads there.

I'd put this after vect_slp_build_vertices but before
the new_graph call - altering 'vertices' / 'leafs' should
be more easily possible and the 'leafs' array contains
all loads already (vect_slp_build_vertices could be massaged
to provide a map from DR_GROUP_FIRST_ELEMENT to slp_tree,
giving us the meta we want).

That said, I'd like to see something more forward-looking
rather than the ad-hoc special-casing of what you run into
with the pattern matching.

In case we want to still go with the special-casing it
should IMHO be done in a pre-order walk simply
looking for lane permute nodes with children that all
load from the same group, performing what you do before
any of the vertices/graph stuff is built.  That's
probably easiest at this point and it can be done
when the bst_map is still around so you can properly
CSE the new load you build.
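
Sketched roughly, using only the accessors already visible in the patch above
(the function name and the elided group check are illustrative, not a
finished implementation):

  /* Pre-order walk over the SLP tree: handle a node before recursing into
     its children, visiting each node only once.  */
  static void
  fold_lane_permutes_to_loads (slp_tree node, hash_set<slp_tree> &visited)
  {
    if (visited.add (node))
      return;
    if (SLP_TREE_LANE_PERMUTATION (node).exists ())
      {
        /* Check that every child is a load and that DR_GROUP_FIRST_ELEMENT
           of their representatives agree; if so, turn NODE into a load whose
           load permutation is derived from the lane permutation, CSEing the
           new load via the still-live bst_map.  */
      }
    unsigned i;
    slp_tree child;
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
      fold_lane_permutes_to_loads (child, visited);
  }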

Thanks,
Richard.



> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-slp.c (vect_optimize_slp): Promote permutes.
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer


* RE: [PATCH v2 10/18]middle-end simplify lane permutes which selects from loads from the same DR.
  2020-11-04 13:35 ` Richard Biener
@ 2020-11-04 14:02   ` Tamar Christina
  2020-11-04 15:12     ` Richard Biener
  0 siblings, 1 reply; 5+ messages in thread
From: Tamar Christina @ 2020-11-04 14:02 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, ook

Hi Richi,

> -----Original Message-----
> From: rguenther@c653.arch.suse.de <rguenther@c653.arch.suse.de> On
> Behalf Of Richard Biener
> Sent: Wednesday, November 4, 2020 1:36 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> Subject: Re: [PATCH v2 10/18]middle-end simplify lane permutes which
> selects from loads from the same DR.
> 
> On Tue, 3 Nov 2020, Tamar Christina wrote:
> 
> > Hi All,
> >
> > This change allows one to simplify lane permutes that select from
> > multiple load leafs that load from the same DR group by promoting the
> > VEC_PERM node into a load itself and pushing the lane permute into it as a
> load permute.
> >
> > This saves us from having to calculate where to materialize a new load node.
> > If the resulting loads are now unused they are freed and are removed
> > from the graph.
> >
> > This allows us to handle cases where we would have generated:
> >
> > 	movi    v4.4s, 0
> > 	adrp    x3, .LC0
> > 	ldr     q5, [x3, #:lo12:.LC0]
> > 	mov     x3, 0
> > 	.p2align 3,,7
> > .L2:
> > 	mov     v0.16b, v4.16b
> > 	mov     v3.16b, v4.16b
> > 	ldr     q1, [x1, x3]
> > 	ldr     q2, [x0, x3]
> > 	fcmla   v0.4s, v2.4s, v1.4s, #0
> > 	fcmla   v3.4s, v1.4s, v2.4s, #0
> > 	fcmla   v0.4s, v2.4s, v1.4s, #270
> > 	fcmla   v3.4s, v1.4s, v2.4s, #270
> > 	mov     v1.16b, v3.16b
> > 	tbl     v0.16b, {v0.16b - v1.16b}, v5.16b
> > 	str     q0, [x2, x3]
> > 	add     x3, x3, 16
> > 	cmp     x3, 1600
> > 	bne     .L2
> > 	ret
> >
> > and instead generate
> >
> > 	mov     x3, 0
> > 	.p2align 3,,7
> > .L27:
> > 	ldr     q0, [x2, x3]
> > 	ldr     q1, [x0, x3]
> > 	ldr     q2, [x1, x3]
> > 	fcmla   v0.2d, v1.2d, v2.2d, #0
> > 	fcmla   v0.2d, v1.2d, v2.2d, #270
> > 	str     q0, [x2, x3]
> > 	add     x3, x3, 16
> > 	cmp     x3, 512
> > 	bne     .L27
> > 	ret
> >
> > This runs as a pre step such that permute simplification can still
> > inspect this permute is needed
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > Tests are included as part of the final patch as they need the SLP
> > pattern matcher to insert permutes in between.
> >
> > Ok for master?
> 
> So I think this is too specialized for the general issue that we're doing a bad
> job in CSEing the load part of different permutes of the same group.  I've
> played with fixing this half a year ago (again) in multiple general ways but
> they all caused some regressions.
> 
> So you're now adding some heuristics as to when to anticipate "CSE" (or
> merging with followup permutes).
> 
> To quickly recap what I did consider two loads (V2DF) one { a[0], a[1] } and
> the other { a[1], a[0] }.  They currently are two SLP nodes and one with a
> load_permutation.
> My original attempts focused on trying to get rid of load_permutation in
> favor of lane_permute nodes and thus during SLP discovery I turned the
> second into { a[0], a[1] } (magically unified with the other load) and a
> followup lane-permute node.
> 
> So for your case you have IIUC { a[0], a[0] } and { a[1], a[1] } which eventually
> will (due to patterns) be lane-permuted into { a[0], a[1] }, right?  So
> generalizing this as a single { a[0], a[1] } plus two lane-permute nodes  { 0, 0 }
> and { 1, 1 } early would solve the issue as well?

Correct, I did wonder why it was generating two different nodes instead of a lane
permute but didn't pay much attention, assuming it was just a shortcoming.

> Now, in general it might be
> more profitable to generate the { a[0], a[0] } and { a[1], a[1] } via scalar-load-
> and-splat rather than vector load and permute so we have to be careful to
> not over-optimize here or be prepared to do the reverse transform.

This could in principle be done in optimize_slp then, right? Since it already does
a lot of the same work and finds the materialization points.

> 
> The patch itself is a bit ugly since it modifies the SLP graph when we already
> produced the graphds graph so I would do any of this before.  I did consider
> gathering all loads nodes loading from a group and then trying to apply some
> heuristic to alter the SLP graph so it can be better optimized.  In fact when we
> want to generate the same code as the non-SLP interleaving scheme does
> we do have to look at those since we have to unify loads there.
> 

Yes... I will concede the patch isn't my finest work. I also don't like the fact that I
had to keep leafs intact lest I break things later. But I wanted feedback :)

> I'd put this after vect_slp_build_vertices but before the new_graph call -
> altering 'vertices' / 'leafs' should be more easily possible and the 'leafs' array
> contains all loads already (vect_slp_build_vertices could be massaged to
> provide a map from DR_GROUP_FIRST_ELEMENT to slp_tree, giving us the
> meta we want).
> 
> That said, I'd like to see something more forward-looking rather than the ad-
> hoc special-casing of what you run into with the pattern matching.
> 

Yeah, I like your suggestion about doing it at build time and CSEing early, but I
don't think I can get that working in a week given that you've already tried multiple times :)
Happy to give it a go when the next stage 1 opens though.

> In case we want to still go with the special-casing it should IMHO be done in a
> pre-order walk simply looking for lane permute nodes with children that all
> load from the same group performing what you do before any of the
> vertices/graph stuff is built.  That's probably easiest at this point and it can be
> done when then bst_map is still around so you can properly CSE the new
> load you build.

That's fair enough. I do think I need a temporary (not terrible) workaround... This would
then need to be somewhere in vect_analyze_slp. Would you prefer I do it during the
construction of the instance or afterwards?

Regards,
Tamar

> 
> Thanks,
> Richard.
> 
> 
> 
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* tree-vect-slp.c (vect_optimize_slp): Promote permutes.
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> Nuernberg, Germany; GF: Felix Imend


* RE: [PATCH v2 10/18]middle-end simplify lane permutes which selects from loads from the same DR.
  2020-11-04 14:02   ` Tamar Christina
@ 2020-11-04 15:12     ` Richard Biener
  2020-11-04 15:17       ` Tamar Christina
  0 siblings, 1 reply; 5+ messages in thread
From: Richard Biener @ 2020-11-04 15:12 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, ook

On Wed, 4 Nov 2020, Tamar Christina wrote:

> Hi Richi,
> 
> > -----Original Message-----
> > From: rguenther@c653.arch.suse.de <rguenther@c653.arch.suse.de> On
> > Behalf Of Richard Biener
> > Sent: Wednesday, November 4, 2020 1:36 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> > Subject: Re: [PATCH v2 10/18]middle-end simplify lane permutes which
> > selects from loads from the same DR.
> > 
> > On Tue, 3 Nov 2020, Tamar Christina wrote:
> > 
> > > Hi All,
> > >
> > > This change allows one to simplify lane permutes that select from
> > > multiple load leafs that load from the same DR group by promoting the
> > > VEC_PERM node into a load itself and pushing the lane permute into it as a
> > load permute.
> > >
> > > This saves us from having to calculate where to materialize a new load node.
> > > If the resulting loads are now unused they are freed and are removed
> > > from the graph.
> > >
> > > This allows us to handle cases where we would have generated:
> > >
> > > 	movi    v4.4s, 0
> > > 	adrp    x3, .LC0
> > > 	ldr     q5, [x3, #:lo12:.LC0]
> > > 	mov     x3, 0
> > > 	.p2align 3,,7
> > > .L2:
> > > 	mov     v0.16b, v4.16b
> > > 	mov     v3.16b, v4.16b
> > > 	ldr     q1, [x1, x3]
> > > 	ldr     q2, [x0, x3]
> > > 	fcmla   v0.4s, v2.4s, v1.4s, #0
> > > 	fcmla   v3.4s, v1.4s, v2.4s, #0
> > > 	fcmla   v0.4s, v2.4s, v1.4s, #270
> > > 	fcmla   v3.4s, v1.4s, v2.4s, #270
> > > 	mov     v1.16b, v3.16b
> > > 	tbl     v0.16b, {v0.16b - v1.16b}, v5.16b
> > > 	str     q0, [x2, x3]
> > > 	add     x3, x3, 16
> > > 	cmp     x3, 1600
> > > 	bne     .L2
> > > 	ret
> > >
> > > and instead generate
> > >
> > > 	mov     x3, 0
> > > 	.p2align 3,,7
> > > .L27:
> > > 	ldr     q0, [x2, x3]
> > > 	ldr     q1, [x0, x3]
> > > 	ldr     q2, [x1, x3]
> > > 	fcmla   v0.2d, v1.2d, v2.2d, #0
> > > 	fcmla   v0.2d, v1.2d, v2.2d, #270
> > > 	str     q0, [x2, x3]
> > > 	add     x3, x3, 16
> > > 	cmp     x3, 512
> > > 	bne     .L27
> > > 	ret
> > >
> > > This runs as a pre step such that permute simplification can still
> > > inspect this permute is needed
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > Tests are included as part of the final patch as they need the SLP
> > > pattern matcher to insert permutes in between.
> > >
> > > Ok for master?
> > 
> > So I think this is too specialized for the general issue that we're doing a bad
> > job in CSEing the load part of different permutes of the same group.  I've
> > played with fixing this half a year ago (again) in multiple general ways but
> > they all caused some regressions.
> > 
> > So you're now adding some heuristics as to when to anticipate "CSE" (or
> > merging with followup permutes).
> > 
> > To quickly recap what I did consider two loads (V2DF) one { a[0], a[1] } and
> > the other { a[1], a[0] }.  They currently are two SLP nodes and one with a
> > load_permutation.
> > My original attempts focused on trying to get rid of load_permutation in
> > favor of lane_permute nodes and thus during SLP discovery I turned the
> > second into { a[0], a[1] } (magically unified with the other load) and a
> > followup lane-permute node.
> > 
> > So for your case you have IIUC { a[0], a[0] } and { a[1], a[1] } which eventually
> > will (due to patterns) be lane-permuted into { a[0], a[1] }, right?  So
> > generalizing this as a single { a[0], a[1] } plus two lane-permute nodes  { 0, 0 }
> > and { 1, 1 } early would solve the issue as well?
> 
> Correct, I did wonder why it was generating two different nodes instead of a lane
> permute but didn't pay much attention that it was just a short coming.
> 
> > Now, in general it might be
> > more profitable to generate the { a[0], a[0] } and { a[1], a[1] } via scalar-load-
> > and-splat rather than vector load and permute so we have to be careful to
> > not over-optimize here or be prepared to do the reverse transform.
> 
> This in principle can be done in optimize_slp then right? Since it would do
> a lot of the same work already and find the materialization points. 
> 
> > 
> > The patch itself is a bit ugly since it modifies the SLP graph when we already
> > produced the graphds graph so I would do any of this before.  I did consider
> > gathering all loads nodes loading from a group and then trying to apply some
> > heuristic to alter the SLP graph so it can be better optimized.  In fact when we
> > want to generate the same code as the non-SLP interleaving scheme does
> > we do have to look at those since we have to unify loads there.
> > 
> 
> Yes.. I will concede the patch isn't my finest work.. I also don't like the fact that I
> had to keep leafs in tact less I break things later. But wanted feedback :) 
> 
> > I'd put this after vect_slp_build_vertices but before the new_graph call -
> > altering 'vertices' / 'leafs' should be more easily possible and the 'leafs' array
> > contains all loads already (vect_slp_build_vertices could be massaged to
> > provide a map from DR_GROUP_FIRST_ELEMENT to slp_tree, giving us the
> > meta we want).
> > 
> > That said, I'd like to see something more forward-looking rather than the ad-
> > hoc special-casing of what you run into with the pattern matching.
> > 
> 
> Yeah, I like your suggestion about doing it at build time and CSEing early, but
> don't think I can get that work in a week given that you've already tried multiple times :)
> Happy to give it a go next stage-1 opening though.
> 
> > In case we want to still go with the special-casing it should IMHO be done in a
> > pre-order walk simply looking for lane permute nodes with children that all
> > load from the same group performing what you do before any of the
> > vertices/graph stuff is built.  That's probably easiest at this point and it can be
> > done when then bst_map is still around so you can properly CSE the new
> > load you build.
> 
> That's fair enough. I do think I need a temporary (not terrible) workaround...This would
> then need to be somewhere in vect_analyze_slp. Would you prefer I do it during the
> construction of the instance of afterwards?

Well, right after pattern recog processed all instances - the issue only
pops up due to pattern recog, right?

> Regards,
> Tamar
> 
> > 
> > Thanks,
> > Richard.
> > 
> > 
> > 
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* tree-vect-slp.c (vect_optimize_slp): Promote permutes.
> > >
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> > Nuernberg, Germany; GF: Felix Imend
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer


* RE: [PATCH v2 10/18]middle-end simplify lane permutes which selects from loads from the same DR.
  2020-11-04 15:12     ` Richard Biener
@ 2020-11-04 15:17       ` Tamar Christina
  0 siblings, 0 replies; 5+ messages in thread
From: Tamar Christina @ 2020-11-04 15:17 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, ook



> -----Original Message-----
> From: rguenther@c653.arch.suse.de <rguenther@c653.arch.suse.de> On
> Behalf Of Richard Biener
> Sent: Wednesday, November 4, 2020 3:12 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> Subject: RE: [PATCH v2 10/18]middle-end simplify lane permutes which
> selects from loads from the same DR.
> 
> On Wed, 4 Nov 2020, Tamar Christina wrote:
> 
> > Hi Richi,
> >
> > > -----Original Message-----
> > > From: rguenther@c653.arch.suse.de <rguenther@c653.arch.suse.de> On
> > > Behalf Of Richard Biener
> > > Sent: Wednesday, November 4, 2020 1:36 PM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> > > Subject: Re: [PATCH v2 10/18]middle-end simplify lane permutes which
> > > selects from loads from the same DR.
> > >
> > > On Tue, 3 Nov 2020, Tamar Christina wrote:
> > >
> > > > Hi All,
> > > >
> > > > This change allows one to simplify lane permutes that select from
> > > > multiple load leafs that load from the same DR group by promoting
> > > > the VEC_PERM node into a load itself and pushing the lane permute
> > > > into it as a
> > > load permute.
> > > >
> > > > This saves us from having to calculate where to materialize a new load
> node.
> > > > If the resulting loads are now unused they are freed and are
> > > > removed from the graph.
> > > >
> > > > This allows us to handle cases where we would have generated:
> > > >
> > > > 	movi    v4.4s, 0
> > > > 	adrp    x3, .LC0
> > > > 	ldr     q5, [x3, #:lo12:.LC0]
> > > > 	mov     x3, 0
> > > > 	.p2align 3,,7
> > > > .L2:
> > > > 	mov     v0.16b, v4.16b
> > > > 	mov     v3.16b, v4.16b
> > > > 	ldr     q1, [x1, x3]
> > > > 	ldr     q2, [x0, x3]
> > > > 	fcmla   v0.4s, v2.4s, v1.4s, #0
> > > > 	fcmla   v3.4s, v1.4s, v2.4s, #0
> > > > 	fcmla   v0.4s, v2.4s, v1.4s, #270
> > > > 	fcmla   v3.4s, v1.4s, v2.4s, #270
> > > > 	mov     v1.16b, v3.16b
> > > > 	tbl     v0.16b, {v0.16b - v1.16b}, v5.16b
> > > > 	str     q0, [x2, x3]
> > > > 	add     x3, x3, 16
> > > > 	cmp     x3, 1600
> > > > 	bne     .L2
> > > > 	ret
> > > >
> > > > and instead generate
> > > >
> > > > 	mov     x3, 0
> > > > 	.p2align 3,,7
> > > > .L27:
> > > > 	ldr     q0, [x2, x3]
> > > > 	ldr     q1, [x0, x3]
> > > > 	ldr     q2, [x1, x3]
> > > > 	fcmla   v0.2d, v1.2d, v2.2d, #0
> > > > 	fcmla   v0.2d, v1.2d, v2.2d, #270
> > > > 	str     q0, [x2, x3]
> > > > 	add     x3, x3, 16
> > > > 	cmp     x3, 512
> > > > 	bne     .L27
> > > > 	ret
> > > >
> > > > This runs as a pre step such that permute simplification can still
> > > > inspect this permute is needed
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > > Tests are included as part of the final patch as they need the SLP
> > > > pattern matcher to insert permutes in between.
> > > >
> > > > Ok for master?
> > >
> > > So I think this is too specialized for the general issue that we're
> > > doing a bad job in CSEing the load part of different permutes of the
> > > same group.  I've played with fixing this half a year ago (again) in
> > > multiple general ways but they all caused some regressions.
> > >
> > > So you're now adding some heuristics as to when to anticipate "CSE"
> > > (or merging with followup permutes).
> > >
> > > To quickly recap what I did consider two loads (V2DF) one { a[0],
> > > a[1] } and the other { a[1], a[0] }.  They currently are two SLP
> > > nodes and one with a load_permutation.
> > > My original attempts focused on trying to get rid of
> > > load_permutation in favor of lane_permute nodes and thus during SLP
> > > discovery I turned the second into { a[0], a[1] } (magically unified
> > > with the other load) and a followup lane-permute node.
> > >
> > > So for your case you have IIUC { a[0], a[0] } and { a[1], a[1] }
> > > which eventually will (due to patterns) be lane-permuted into {
> > > a[0], a[1] }, right?  So generalizing this as a single { a[0], a[1]
> > > } plus two lane-permute nodes  { 0, 0 } and { 1, 1 } early would solve the
> issue as well?
> >
> > Correct, I did wonder why it was generating two different nodes
> > instead of a lane permute but didn't pay much attention that it was just a
> short coming.
> >
> > > Now, in general it might be
> > > more profitable to generate the { a[0], a[0] } and { a[1], a[1] }
> > > via scalar-load- and-splat rather than vector load and permute so we
> > > have to be careful to not over-optimize here or be prepared to do the
> reverse transform.
> >
> > This in principle can be done in optimize_slp then right? Since it
> > would do a lot of the same work already and find the materialization points.
> >
> > >
> > > The patch itself is a bit ugly since it modifies the SLP graph when
> > > we already produced the graphds graph so I would do any of this
> > > before.  I did consider gathering all loads nodes loading from a
> > > group and then trying to apply some heuristic to alter the SLP graph
> > > so it can be better optimized.  In fact when we want to generate the
> > > same code as the non-SLP interleaving scheme does we do have to look
> at those since we have to unify loads there.
> > >
> >
> > Yes.. I will concede the patch isn't my finest work.. I also don't
> > like the fact that I had to keep leafs in tact less I break things
> > later. But wanted feedback :)
> >
> > > I'd put this after vect_slp_build_vertices but before the new_graph
> > > call - altering 'vertices' / 'leafs' should be more easily possible
> > > and the 'leafs' array contains all loads already
> > > (vect_slp_build_vertices could be massaged to provide a map from
> > > DR_GROUP_FIRST_ELEMENT to slp_tree, giving us the meta we want).
> > >
> > > That said, I'd like to see something more forward-looking rather
> > > than the ad- hoc special-casing of what you run into with the pattern
> matching.
> > >
> >
> > Yeah, I like your suggestion about doing it at build time and CSEing
> > early, but don't think I can get that work in a week given that you've
> > already tried multiple times :) Happy to give it a go next stage-1 opening
> though.
> >
> > > In case we want to still go with the special-casing it should IMHO
> > > be done in a pre-order walk simply looking for lane permute nodes
> > > with children that all load from the same group performing what you
> > > do before any of the vertices/graph stuff is built.  That's probably
> > > easiest at this point and it can be done when then bst_map is still
> > > around so you can properly CSE the new load you build.
> >
> > That's fair enough. I do think I need a temporary (not terrible)
> > workaround...This would then need to be somewhere in vect_analyze_slp.
> > Would you prefer I do it during the construction of the instance of
> afterwards?
> 
> Well, right after pattern recog processed all instances - the issue only pops up
> due to pattern recog, right?

Ah true *facepalm* though I'll need to refactor slightly to keep bst_map around a bit longer.

Regards,
Tamar

> 
> > Regards,
> > Tamar
> >
> > >
> > > Thanks,
> > > Richard.
> > >
> > >
> > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > 	* tree-vect-slp.c (vect_optimize_slp): Promote permutes.
> > > >
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> > > Nuernberg, Germany; GF: Felix Imend
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> Nuernberg, Germany; GF: Felix Imend


end of thread, other threads:[~2020-11-04 15:17 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
2020-11-03 15:07 [PATCH v2 10/18]middle-end simplify lane permutes which selects from loads from the same DR Tamar Christina
2020-11-04 13:35 ` Richard Biener
2020-11-04 14:02   ` Tamar Christina
2020-11-04 15:12     ` Richard Biener
2020-11-04 15:17       ` Tamar Christina
