public inbox for gcc-cvs@sourceware.org
* [gcc r11-3823] optimize permutes in SLP, remove vect_attempt_slp_rearrange_stmts
@ 2020-10-12 13:44 Richard Biener
  0 siblings, 0 replies; only message in thread
From: Richard Biener @ 2020-10-12 13:44 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:126ed72b9f48f8530b194532cc281fb761690435

commit r11-3823-g126ed72b9f48f8530b194532cc281fb761690435
Author: Richard Biener <rguenther@suse.de>
Date:   Wed Sep 30 17:08:01 2020 +0200

    optimize permutes in SLP, remove vect_attempt_slp_rearrange_stmts
    
    This introduces a permute optimization phase for SLP which is
    intended to cover the existing permute eliding for SLP reductions
    plus commonizing permutes in the easy cases.
    
    It currently uses graphds to compute a postorder on the reverse
    SLP graph and handles all cases that vect_attempt_slp_rearrange_stmts
    did (hopefully - I've adjusted most testcases that triggered it
    a few days ago).  It restricts itself to moving around bijective
    permutations to simplify things for now, mainly around constant nodes.
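
    A minimal standalone sketch of such a bijective apply/undo helper,
    mirroring the semantics of the new vect_slp_permute helper in this
    patch but using std::vector instead of GCC's vec<> (the name
    apply_perm is illustrative):

      #include <vector>

      /* Apply PERM to V (gather), or undo a previous application
         (scatter) when REVERSE; PERM must be bijective for the two
         directions to round-trip.  */
      template <class T>
      void
      apply_perm (const std::vector<unsigned> &perm, std::vector<T> &v,
                  bool reverse)
      {
        std::vector<T> saved = v;      /* snapshot the original lanes */
        for (unsigned i = 0; i < v.size (); ++i)
          if (reverse)
            v[perm[i]] = saved[i];     /* scatter: undo an apply */
          else
            v[i] = saved[perm[i]];     /* gather: apply the permute */
      }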
    
    As a prerequisite it makes the SLP graph cyclic (ugh).  It looks
    like it would pay off to compute a PRE/POST order visit array
    once and elide all the recursive SLP graph walks and their
    visited hash-sets, at least for the phases where we do not
    change the SLP graph during such a walk.
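
    A sketch of that idea, with a toy node type standing in for
    slp_tree (illustrative, not the patch's code); the -2 on-stack
    marker is what keeps the walk terminating on the now-cyclic graph:

      #include <vector>

      struct node { std::vector<node *> children; int state = -1; };

      /* One DFS fills ORDER in postorder; subsequent walks can then
         loop over ORDER instead of recursing with a visited hash-set.  */
      void
      compute_postorder (node *n, std::vector<node *> &order)
      {
        if (n->state != -1)
          return;                        /* visited or on the stack */
        n->state = -2;                   /* on-stack: cuts back edges */
        for (node *c : n->children)
          compute_postorder (c, order);
        n->state = (int) order.size ();  /* final postorder index */
        order.push_back (n);
      }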
    
    I do not like using graphds too much but at least I don't have to
    re-implement yet another RPO walk, so maybe it isn't too bad.
    
    It now computes permute placement during iteration and thus should
    get cycles more obviously correct.
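
    A rough sketch of that fixed-point propagation, with perm id 0
    meaning "no permute" and -1 "unknown" as in the patch; succs and
    ipo are illustrative stand-ins for the graphds graph and the
    postorder computed on its inverse:

      #include <vector>

      /* Re-run the reverse-postorder sweep until no perm id changes;
         a node keeps a permute only if all already-known successors
         agree on it, otherwise it falls back to 0 (no permute).  */
      void
      propagate_perms (const std::vector<std::vector<int> > &succs,
                       const std::vector<int> &ipo,
                       std::vector<int> &perm)
      {
        bool changed;
        do
          {
            changed = false;
            for (int idx : ipo)
              {
                int p = -1;
                for (int s : succs[idx])
                  {
                    if (perm[s] == -1)
                      continue;          /* optimistic for unknowns */
                    if (p == -1)
                      p = perm[s];
                    else if (perm[s] != p)
                      {
                        p = 0;           /* conflict: no common permute */
                        break;
                      }
                  }
                if (p != -1 && p != perm[idx])
                  {
                    perm[idx] = p;
                    changed = true;      /* iterate again for cycles */
                  }
              }
          }
        while (changed);
      }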
    
    Richard.
    
    2020-10-06  Richard Biener  <rguenther@suse.de>
    
            * tree-vect-data-refs.c (vect_slp_analyze_instance_dependence):
            Use SLP_TREE_REPRESENTATIVE.
            * tree-vectorizer.h (_slp_tree::vertex): New member used
            for graphds interfacing.
            * tree-vect-slp.c (vect_build_slp_tree_2): Allocate space
            for PHI SLP children.
            (vect_analyze_slp_backedges): New function filling in SLP
            node children for PHIs that correspond to backedge values.
            (vect_analyze_slp): Call vect_analyze_slp_backedges for the
            graph.
            (vect_slp_analyze_node_operations): Deal with a cyclic graph.
            (vect_schedule_slp_instance): Likewise.
            (vect_schedule_slp): Likewise.
            (slp_copy_subtree): Remove.
            (vect_slp_rearrange_stmts): Likewise.
            (vect_attempt_slp_rearrange_stmts): Likewise.
            (vect_slp_build_vertices): New functions.
            (vect_slp_permute): Likewise.
            (vect_slp_perms_eq): Likewise.
            (vect_optimize_slp): Remove special code to elide
            permutations with SLP reductions.  Implement generic
            permute optimization.
    
            * gcc.dg/vect/bb-slp-50.c: New testcase.
            * gcc.dg/vect/bb-slp-51.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/bb-slp-50.c |  20 +
 gcc/testsuite/gcc.dg/vect/bb-slp-51.c |  20 +
 gcc/tree-vect-data-refs.c             |   2 +-
 gcc/tree-vect-slp.c                   | 685 +++++++++++++++++++++++-----------
 gcc/tree-vectorizer.h                 |   2 +
 5 files changed, 503 insertions(+), 226 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-50.c b/gcc/testsuite/gcc.dg/vect/bb-slp-50.c
new file mode 100644
index 00000000000..80216be4ebf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-50.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+
+double a[2];
+double b[2];
+double c[2];
+double d[2];
+double e[2];
+void foo(double x)
+{
+  double tembc0 = b[1] + c[1];
+  double tembc1 = b[0] + c[0];
+  double temde0 = d[0] + e[1];
+  double temde1 = d[1] + e[0];
+  a[0] = tembc0 + temde0;
+  a[1] = tembc1 + temde1;
+}
+
+/* We should common the permutation on the tembc{0,1} operands.  */
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp2" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-51.c b/gcc/testsuite/gcc.dg/vect/bb-slp-51.c
new file mode 100644
index 00000000000..1481018428e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-51.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+
+double a[2];
+double b[2];
+double c[2];
+double e[2];
+void foo(double x)
+{
+  double tembc0 = b[1] + c[1];
+  double tembc1 = b[0] + c[0];
+  double temde0 = 5 + e[1];
+  double temde1 = 11 + e[0];
+  a[0] = tembc0 + temde0;
+  a[1] = tembc1 + temde1;
+}
+
+/* We should common the permutations on the tembc{0,1} and temde{0,1}
+   operands.  */
+/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\r\\n\]* VEC_PERM_EXPR" 1 "slp2" } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 676182c0888..3ff3088641a 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -841,7 +841,7 @@ vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
 
   /* The stores of this instance are at the root of the SLP tree.  */
   slp_tree store = SLP_INSTANCE_TREE (instance);
-  if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
+  if (! STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (store)))
     store = NULL;
 
   /* Verify we can sink stores to the vectorized stmt insert location.  */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 8acef6f3cef..ff0ecda801b 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1256,8 +1256,8 @@ vect_build_slp_tree_2 (vec_info *vinfo,
       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
 	nops++;
     }
-  else if (is_a <gphi *> (stmt_info->stmt))
-    nops = 0;
+  else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
+    nops = gimple_phi_num_args (phi);
   else
     return NULL;
 
@@ -1294,7 +1294,7 @@ vect_build_slp_tree_2 (vec_info *vinfo,
       else
 	return NULL;
       (*tree_size)++;
-      node = vect_create_new_slp_node (stmts, 0);
+      node = vect_create_new_slp_node (stmts, nops);
       SLP_TREE_VECTYPE (node) = vectype;
       return node;
     }
@@ -1812,188 +1812,6 @@ vect_mark_slp_stmts_relevant (slp_tree node)
   vect_mark_slp_stmts_relevant (node, visited);
 }
 
-/* Copy the SLP subtree rooted at NODE.  */
-
-static slp_tree
-slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
-{
-  unsigned i;
-
-  bool existed_p;
-  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
-  if (existed_p)
-    return copy_ref;
-
-  copy_ref = new _slp_tree;
-  slp_tree copy = copy_ref;
-  SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
-  SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
-  SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
-  SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
-  copy->max_nunits = node->max_nunits;
-  SLP_TREE_REF_COUNT (copy) = 0;
-  if (SLP_TREE_SCALAR_STMTS (node).exists ())
-    SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
-  if (SLP_TREE_SCALAR_OPS (node).exists ())
-    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
-  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
-    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
-  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
-    SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
-  if (SLP_TREE_CHILDREN (node).exists ())
-    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
-  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
-
-  slp_tree child;
-  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
-    {
-      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
-      SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (copy)[i])++;
-    }
-  return copy;
-}
-
-/* Rearrange the statements of NODE according to PERMUTATION.  */
-
-static void
-vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
-                          vec<unsigned> permutation,
-			  hash_set<slp_tree> &visited)
-{
-  unsigned int i;
-  slp_tree child;
-
-  if (visited.add (node))
-    return;
-
-  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_slp_rearrange_stmts (child, group_size, permutation, visited);
-
-  if (SLP_TREE_SCALAR_STMTS (node).exists ())
-    {
-      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
-      vec<stmt_vec_info> tmp_stmts;
-      tmp_stmts.create (group_size);
-      tmp_stmts.quick_grow (group_size);
-      stmt_vec_info stmt_info;
-      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
-	tmp_stmts[permutation[i]] = stmt_info;
-      SLP_TREE_SCALAR_STMTS (node).release ();
-      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
-    }
-  if (SLP_TREE_SCALAR_OPS (node).exists ())
-    {
-      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
-      vec<tree> tmp_ops;
-      tmp_ops.create (group_size);
-      tmp_ops.quick_grow (group_size);
-      tree op;
-      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
-	tmp_ops[permutation[i]] = op;
-      SLP_TREE_SCALAR_OPS (node).release ();
-      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
-    }
-  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
-    {
-      gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
-      for (i = 0; i < group_size; ++i)
-	SLP_TREE_LANE_PERMUTATION (node)[i].second
-	  = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
-    }
-}
-
-
-/* Attempt to reorder stmts in a reduction chain so that we don't
-   require any load permutation.  Return true if that was possible,
-   otherwise return false.  */
-
-static bool
-vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
-{
-  unsigned int i, j;
-  unsigned int lidx;
-  slp_tree node, load;
-
-  if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
-    return false;
-
-  /* Compare all the permutation sequences to the first one.  We know
-     that at least one load is permuted.  */
-  node = SLP_INSTANCE_LOADS (slp_instn)[0];
-  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
-    return false;
-  unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
-  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
-    {
-      if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
-	  || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
-	return false;
-      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
-	if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
-	  return false;
-    }
-
-  /* Check that the loads in the first sequence are different and there
-     are no gaps between them and that there is an actual permutation.  */
-  bool any_permute = false;
-  auto_sbitmap load_index (group_size);
-  bitmap_clear (load_index);
-  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
-    {
-      if (lidx != i)
-	any_permute = true;
-      if (lidx >= group_size)
-	return false;
-      if (bitmap_bit_p (load_index, lidx))
-	return false;
-
-      bitmap_set_bit (load_index, lidx);
-    }
-  if (!any_permute)
-    return false;
-  for (i = 0; i < group_size; i++)
-    if (!bitmap_bit_p (load_index, i))
-      return false;
-
-  /* This permutation is valid for reduction.  Since the order of the
-     statements in the nodes is not important unless they are memory
-     accesses, we can rearrange the statements in all the nodes
-     according to the order of the loads.  */
-
-  /* We have to unshare the SLP tree we modify.  */
-  hash_map<slp_tree, slp_tree> map;
-  slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
-  vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn));
-  SLP_TREE_REF_COUNT (unshared)++;
-  SLP_INSTANCE_TREE (slp_instn) = unshared;
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
-  node = SLP_INSTANCE_LOADS (slp_instn)[0];
-
-  /* Do the actual re-arrangement.  */
-  hash_set<slp_tree> visited;
-  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
-			    node->load_permutation, visited);
-
-  /* We are done, no actual permutations need to be generated.  */
-  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    {
-      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
-      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
-      /* But we have to keep those permutations that are required because
-         of handling of gaps.  */
-      if (known_eq (unrolling_factor, 1U)
-	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
-	      && DR_GROUP_GAP (first_stmt_info) == 0))
-	SLP_TREE_LOAD_PERMUTATION (node).release ();
-      else
-	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
-	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
-    }
-
-  return true;
-}
 
 /* Gather loads in the SLP graph NODE and populate the INST loads array.  */
 
@@ -2458,6 +2276,53 @@ vect_analyze_slp_instance (vec_info *vinfo,
   return false;
 }
 
+/* Fill in backedge SLP children in the SLP graph.  */
+
+static void
+vect_analyze_slp_backedges (vec_info *vinfo, slp_tree node,
+			    scalar_stmts_to_slp_tree_map_t *bst_map,
+			    hash_set<slp_tree> &visited)
+{
+  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
+      || visited.add (node))
+    return;
+
+  slp_tree child;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    if (child)
+      vect_analyze_slp_backedges (vinfo, child, bst_map, visited);
+
+  if (gphi *phi = dyn_cast <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
+    for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
+      {
+	auto_vec<stmt_vec_info, 64> stmts;
+	unsigned j;
+	stmt_vec_info phi_info;
+	FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, phi_info)
+	  {
+	    tree def = gimple_phi_arg_def (as_a <gphi *>(phi_info->stmt), i);
+	    stmt_vec_info def_info = vinfo->lookup_def (def);
+	    if (!def_info)
+	      break;
+	    stmts.safe_push (vect_stmt_to_vectorize (def_info));
+	  }
+	if (j != SLP_TREE_LANES (node))
+	  continue;
+	slp_tree *edge_def = bst_map->get (stmts);
+	if (edge_def)
+	  {
+	    /* ???  We're currently not recording non-backedge children
+	       of PHIs like external reduction initial values.  Avoid
+	       NULL entries in SLP_TREE_CHILDREN for those and thus
+	       for now simply only record backedge defs at a
+	       SLP_TREE_CHILDREN index possibly not matching that of
+	       the corresponding PHI argument index.  */
+	    SLP_TREE_CHILDREN (node).quick_push (*edge_def);
+	    (*edge_def)->refcnt++;
+	  }
+      }
+}
 
 /* Check if there are stmts in the loop that can be vectorized using SLP.  Build SLP
    trees of packed scalar stmts if SLP is possible.  */
@@ -2509,6 +2374,13 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 				   max_tree_size);
     }
 
+  /* Fill in backedges.  */
+  slp_instance instance;
+  hash_set<slp_tree> visited;
+  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
+    vect_analyze_slp_backedges (vinfo, SLP_INSTANCE_TREE (instance),
+				bst_map, visited);
+
   /* The map keeps a reference on SLP nodes built, release that.  */
   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
        it != bst_map->end (); ++it)
@@ -2519,34 +2391,396 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
   return opt_result::success ();
 }
 
+/* Fill the vertices and leafs vectors with all nodes in the SLP graph.  */
+
+static void
+vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
+			 vec<slp_tree> &vertices, vec<int> &leafs)
+{
+  unsigned i;
+  slp_tree child;
+
+  if (visited.add (node))
+    return;
+
+  node->vertex = vertices.length ();
+  vertices.safe_push (node);
+  if (SLP_TREE_CHILDREN (node).is_empty ())
+    leafs.safe_push (node->vertex);
+  else
+    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+      vect_slp_build_vertices (visited, child, vertices, leafs);
+}
+
+/* Fill the vertices and leafs vectors with all nodes in the SLP graph.  */
+
+static void
+vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
+			 vec<int> &leafs)
+{
+  hash_set<slp_tree> visited;
+  unsigned i;
+  slp_instance instance;
+  FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
+    vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
+			     leafs);
+}
+
+/* Apply (reverse) bijective PERM to VEC.  */
+
+template <class T>
+static void
+vect_slp_permute (vec<unsigned> perm,
+		  vec<T> &vec, bool reverse)
+{
+  auto_vec<T, 64> saved;
+  saved.create (vec.length ());
+  for (unsigned i = 0; i < vec.length (); ++i)
+    saved.quick_push (vec[i]);
+
+  if (reverse)
+    {
+      for (unsigned i = 0; i < vec.length (); ++i)
+	vec[perm[i]] = saved[i];
+      for (unsigned i = 0; i < vec.length (); ++i)
+	gcc_assert (vec[perm[i]] == saved[i]);
+    }
+  else
+    {
+      for (unsigned i = 0; i < vec.length (); ++i)
+	vec[i] = saved[perm[i]];
+      for (unsigned i = 0; i < vec.length (); ++i)
+	gcc_assert (vec[i] == saved[perm[i]]);
+    }
+}
+
+/* Return whether permutations PERM_A and PERM_B as recorded in the
+   PERMS vector are equal.  */
+
+static bool
+vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
+		   int perm_a, int perm_b)
+{
+  return (perm_a == perm_b
+	  || (perms[perm_a].length () == perms[perm_b].length ()
+	      && memcmp (&perms[perm_a][0], &perms[perm_b][0],
+			 sizeof (unsigned) * perms[perm_a].length ()) == 0));
+}
+
+/* Optimize the SLP graph of VINFO.  */
+
 void
 vect_optimize_slp (vec_info *vinfo)
 {
-  /* Optimize permutations in SLP reductions.  */
-  slp_instance instance;
+  if (vinfo->slp_instances.is_empty ())
+    return;
+
+  slp_tree node;
   unsigned i;
-  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
+  auto_vec<slp_tree> vertices;
+  auto_vec<int> leafs;
+  vect_slp_build_vertices (vinfo, vertices, leafs);
+
+  struct graph *slpg = new_graph (vertices.length ());
+  FOR_EACH_VEC_ELT (vertices, i, node)
     {
-      slp_tree node = SLP_INSTANCE_TREE (instance);
-      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
-      /* Reduction (there are no data-refs in the root).
-	 In reduction chain the order of the loads is not important.  */
-      if (!STMT_VINFO_DATA_REF (stmt_info)
-	  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
-	  && !SLP_INSTANCE_ROOT_STMT (instance))
-	vect_attempt_slp_rearrange_stmts (instance);
+      unsigned j;
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
+	add_edge (slpg, i, child->vertex);
     }
 
-  /* Gather all loads in the SLP graph.  */
-  auto_vec<slp_tree> slp_loads;
-  hash_set<slp_tree> visited;
-  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
-    vect_gather_slp_loads (slp_loads, SLP_INSTANCE_TREE (instance),
-			   visited);
+  /* Compute (reverse) postorder on the inverted graph.  */
+  auto_vec<int> ipo;
+  graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
 
-  slp_tree node;
-  FOR_EACH_VEC_ELT (slp_loads, i, node)
+  auto_sbitmap n_visited (vertices.length ());
+  auto_sbitmap n_materialize (vertices.length ());
+  auto_vec<int> n_perm (vertices.length ());
+  auto_vec<vec<unsigned> > perms;
+
+  bitmap_clear (n_visited);
+  bitmap_clear (n_materialize);
+  n_perm.quick_grow_cleared (vertices.length ());
+  perms.safe_push (vNULL); /* zero is no permute */
+
+  /* Produce initial permutations.  */
+  for (i = 0; i < leafs.length (); ++i)
+    {
+      int idx = leafs[i];
+      slp_tree node = vertices[idx];
+
+      /* Handle externals and constants optimistically throughout the
+	 iteration.  */
+      if (SLP_TREE_DEF_TYPE (node) == vect_external_def
+	  || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
+	continue;
+
+      /* Loads are the only thing generating permutes and leafs do not
+	 change across iterations.  */
+      bitmap_set_bit (n_visited, idx);
+      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
+	continue;
+
+      /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
+	 node unpermuted, record this permute.  */
+      stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
+      if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
+	continue;
+      dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
+      unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
+      bool any_permute = false;
+      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+	{
+	  unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
+	  imin = MIN (imin, idx);
+	  imax = MAX (imax, idx);
+	  if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
+	    any_permute = true;
+	}
+      /* If there's no permute no need to split one out.  */
+      if (!any_permute)
+	continue;
+      /* If the span doesn't match we'd disrupt VF computation, avoid
+	 that for now.  */
+      if (imax - imin + 1 != SLP_TREE_LANES (node))
+	continue;
+
+      /* For now only handle true permutes, like
+	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
+	 when permuting constants and invariants keeping the permute
+	 bijective.  */
+      auto_sbitmap load_index (SLP_TREE_LANES (node));
+      bitmap_clear (load_index);
+      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+	bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
+      unsigned j;
+      for (j = 0; j < SLP_TREE_LANES (node); ++j)
+	if (!bitmap_bit_p (load_index, j))
+	  break;
+      if (j != SLP_TREE_LANES (node))
+	continue;
+
+      vec<unsigned> perm = vNULL;
+      perm.safe_grow (SLP_TREE_LANES (node), true);
+      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+	perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
+      perms.safe_push (perm);
+      n_perm[idx] = perms.length () - 1;
+    }
+
+  /* Propagate permutes along the graph and compute materialization points.  */
+  bool changed;
+  unsigned iteration = 0;
+  do
+    {
+      changed = false;
+      ++iteration;
+
+      for (i = vertices.length (); i > 0 ; --i)
+	{
+	  int idx = ipo[i-1];
+	  slp_tree node = vertices[idx];
+	  /* For leafs there's nothing to do - we've seeded permutes
+	     on those above.  */
+	  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
+	    continue;
+
+	  bitmap_set_bit (n_visited, idx);
+
+	  /* We cannot move a permute across a store.  */
+	  if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
+	      && DR_IS_WRITE
+		   (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
+	    continue;
+
+	  int perm = -1;
+	  for (graph_edge *succ = slpg->vertices[idx].succ;
+	       succ; succ = succ->succ_next)
+	    {
+	      int succ_idx = succ->dest;
+	      /* Handle unvisited nodes optimistically.  */
+	      /* ???  But for constants once we want to handle non-bijective
+		 permutes we have to verify the permute, when unifying lanes,
+		 will not unify different constants.  For example see
+		 gcc.dg/vect/bb-slp-14.c for a case that would break.  */
+	      if (!bitmap_bit_p (n_visited, succ_idx))
+		continue;
+	      int succ_perm = n_perm[succ_idx];
+	      /* Once we materialize a successor's permutation its output
+		 lanes appear unpermuted to us.  */
+	      if (bitmap_bit_p (n_materialize, succ_idx))
+		succ_perm = 0;
+	      if (perm == -1)
+		perm = succ_perm;
+	      else if (succ_perm == 0)
+		{
+		  perm = 0;
+		  break;
+		}
+	      else if (!vect_slp_perms_eq (perms, perm, succ_perm))
+		{
+		  perm = 0;
+		  break;
+		}
+	    }
+
+	  if (perm == -1)
+	    /* Pick up pre-computed leaf values.  */
+	    perm = n_perm[idx];
+	  else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
+	    {
+	      if (iteration > 1)
+		/* Make sure we eventually converge.  */
+		gcc_checking_assert (perm == 0);
+	      n_perm[idx] = perm;
+	      if (perm == 0)
+		bitmap_clear_bit (n_materialize, idx);
+	      changed = true;
+	    }
+
+	  if (perm == 0)
+	    continue;
+
+	  /* Elide pruning at materialization points in the first
+	     iteration so every node has been visited at least once.  */
+	  if (iteration == 1)
+	    continue;
+
+	  /* Decide on permute materialization.  Look whether there's
+	     a use (pred) edge that is permuted differently than us.
+	     In that case mark ourselves so the permutation is applied.  */
+	  bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
+	  for (graph_edge *pred = slpg->vertices[idx].pred;
+	       pred; pred = pred->pred_next)
+	    {
+	      gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
+	      int pred_perm = n_perm[pred->src];
+	      if (!vect_slp_perms_eq (perms, perm, pred_perm))
+		{
+		  all_preds_permuted = false;
+		  break;
+		}
+	    }
+	  if (!all_preds_permuted)
+	    {
+	      if (!bitmap_bit_p (n_materialize, idx))
+		changed = true;
+	      bitmap_set_bit (n_materialize, idx);
+	    }
+	}
+    }
+  while (changed || iteration == 1);
+
+  /* Materialize.  */
+  for (i = 0; i < vertices.length (); ++i)
+    {
+      int perm = n_perm[i];
+      if (perm <= 0)
+	continue;
+
+      slp_tree node = vertices[i];
+
+      /* First permute invariant/external original successors.  */
+      unsigned j;
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
+	{
+	  if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
+	    continue;
+
+	  /* If the vector is uniform there's nothing to do.  */
+	  if (vect_slp_tree_uniform_p (child))
+	    continue;
+
+	  /* We can end up sharing some externals via two_operator
+	     handling.  Be prepared to unshare those.  */
+	  if (child->refcnt != 1)
+	    {
+	      gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
+	      SLP_TREE_CHILDREN (node)[j] = child
+		= vect_create_new_slp_node
+		    (SLP_TREE_SCALAR_OPS (child).copy ());
+	    }
+	  vect_slp_permute (perms[perm],
+			    SLP_TREE_SCALAR_OPS (child), true);
+	}
+
+      if (bitmap_bit_p (n_materialize, i))
+	{
+	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
+	    /* For loads simply drop the permutation, the load permutation
+	       already performs the desired permutation.  */
+	    ;
+	  else
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "inserting permute node in place of %p\n",
+				 node);
+
+	      /* Make a copy of NODE and in-place change it to a
+		 VEC_PERM node to permute the lanes of the copy.  */
+	      slp_tree copy = new _slp_tree;
+	      SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
+	      SLP_TREE_CHILDREN (node) = vNULL;
+	      SLP_TREE_SCALAR_STMTS (copy)
+		= SLP_TREE_SCALAR_STMTS (node).copy ();
+	      vect_slp_permute (perms[perm],
+				SLP_TREE_SCALAR_STMTS (copy), true);
+	      gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
+	      SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
+	      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
+	      SLP_TREE_LANE_PERMUTATION (copy)
+		= SLP_TREE_LANE_PERMUTATION (node);
+	      SLP_TREE_LANE_PERMUTATION (node) = vNULL;
+	      SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
+	      copy->refcnt = 1;
+	      copy->max_nunits = node->max_nunits;
+	      SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
+	      SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
+	      SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
+
+	      /* Now turn NODE into a VEC_PERM.  */
+	      SLP_TREE_CHILDREN (node).safe_push (copy);
+	      SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
+	      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+		SLP_TREE_LANE_PERMUTATION (node)
+		  .quick_push (std::make_pair (0, perms[perm][j]));
+	      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
+	    }
+	}
+      else
+	{
+	  /* Apply the reverse permutation to our stmts.  */
+	  vect_slp_permute (perms[perm],
+			    SLP_TREE_SCALAR_STMTS (node), true);
+	  /* And to the load permutation, which we can simply
+	     make regular by design.  */
+	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
+	    {
+	      /* ???  When we handle non-bijective permutes the idea
+		 is that we can force the load-permutation to be
+		 { min, min + 1, min + 2, ... max }.  But then the
+		 scalar defs might no longer match the lane content
+		 which means wrong-code with live lane vectorization.
+		 So we possibly have to have NULL entries for those.  */
+	      vect_slp_permute (perms[perm],
+				SLP_TREE_LOAD_PERMUTATION (node), true);
+	    }
+	}
+    }
+
+  /* Free the perms vector used for propagation.  */
+  while (!perms.is_empty ())
+    perms.pop ().release ();
+  free_graph (slpg);
+
+
+  /* Now elide load permutations that are not necessary.  */
+  for (i = 0; i < leafs.length (); ++i)
     {
+      node = vertices[i];
       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
 	continue;
 
@@ -2593,7 +2827,8 @@ vect_optimize_slp (vec_info *vinfo)
 	      /* The load requires permutation when unrolling exposes
 		 a gap either because the group is larger than the SLP
 		 group-size or because there is a gap between the groups.  */
-	      && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
+	      && (known_eq (LOOP_VINFO_VECT_FACTOR
+			      (as_a <loop_vec_info> (vinfo)), 1U)
 		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
 		      && DR_GROUP_GAP (first_stmt_info) == 0)))
 	    {
@@ -2975,12 +3210,9 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
     return true;
 
   /* If we already analyzed the exact same set of scalar stmts we're done.
-     We share the generated vector stmts for those.
-     The SLP graph is acyclic so not caching whether we failed or succeeded
-     doesn't result in any issue since we throw away the lvisited set
-     when we fail.  */
+     We share the generated vector stmts for those.  */
   if (visited.contains (node)
-      || lvisited.contains (node))
+      || lvisited.add (node))
     return true;
 
   bool res = true;
@@ -2993,12 +3225,10 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
     }
 
   if (res)
-    {
-      res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
-						cost_vec);
-      if (res)
-	lvisited.add (node);
-    }
+    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
+					      cost_vec);
+  if (!res)
+    lvisited.remove (node);
 
   /* When the node can be vectorized, cost invariant nodes it references.
      This is not done in DFS order to allow the referring node
@@ -4685,7 +4915,8 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
 
 static void
 vect_schedule_slp_instance (vec_info *vinfo,
-			    slp_tree node, slp_instance instance)
+			    slp_tree node, slp_instance instance,
+			    hash_set<slp_tree> &visited)
 {
   gimple_stmt_iterator si;
   int i;
@@ -4712,8 +4943,12 @@ vect_schedule_slp_instance (vec_info *vinfo,
       return;
     }
 
+  /* ???  If we had a way to mark backedges, that would be cheaper.  */
+  if (visited.add (node))
+    return;
+
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_schedule_slp_instance (vinfo, child, instance);
+    vect_schedule_slp_instance (vinfo, child, instance, visited);
 
   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
   SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
@@ -4737,14 +4972,13 @@ vect_schedule_slp_instance (vec_info *vinfo,
 	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
       si = gsi_for_stmt (last_stmt_info->stmt);
     }
-  else if (SLP_TREE_CHILDREN (node).is_empty ())
+  else if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
+	    == cycle_phi_info_type)
+	   || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
+	       == induc_vec_info_type))
     {
-      /* This happens for reduction and induction PHIs where we do not use the
+      /* For reduction and induction PHIs we do not use the
 	 insertion iterator.  */
-      gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
-		  == cycle_phi_info_type
-		  || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
-		      == induc_vec_info_type));
       si = gsi_none ();
     }
   else
@@ -4957,6 +5191,7 @@ vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
   slp_instance instance;
   unsigned int i;
 
+  hash_set<slp_tree> visited;
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       slp_tree node = SLP_INSTANCE_TREE (instance);
@@ -4971,7 +5206,7 @@ vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
 				SLP_INSTANCE_TREE (instance));
 	}
       /* Schedule the tree of INSTANCE.  */
-      vect_schedule_slp_instance (vinfo, node, instance);
+      vect_schedule_slp_instance (vinfo, node, instance, visited);
 
       if (SLP_INSTANCE_ROOT_STMT (instance))
 	vectorize_slp_instance_root_stmt (node, instance);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 38daa05aebb..2a8c4a56a8a 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -161,6 +161,8 @@ struct _slp_tree {
   unsigned int lanes;
   /* The operation of this node.  */
   enum tree_code code;
+
+  int vertex;
 };

