public inbox for gcc-patches@gcc.gnu.org
Subject: [PATCH 1/8 v9] middle-end slp: Support optimizing load distribution
From: Tamar Christina @ 2020-12-28 13:35 UTC
  To: gcc-patches; +Cc: nd, rguenther, ook

Hi All,

This introduces a post-processing step for the pattern matcher to flatten
permutes introduced by the complex multiplication patterns.

This performs the blend early so that SLP is not cancelled by the LOAD_LANES
permute.  It is a temporary workaround for the fact that loads are not CSEd
during SLP build, and it is required to produce efficient code.
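
As a rough illustration (a hypothetical testcase, not part of this patch),
a kernel like the following is the kind of input where the complex
multiplication patterns introduce the VEC_PERM blend that this pass
flattens; on AArch64 something like -O3 -march=armv8.3-a (flags
illustrative) enables the complex-arithmetic instructions the patterns
target:

  #include <complex.h>

  void
  f (_Complex float *restrict a, _Complex float *restrict b,
     _Complex float *restrict c, int n)
  {
    for (int i = 0; i < n; i++)
      c[i] = a[i] * b[i];
  }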

Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu
with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-slp.c (optimize_load_redistribution_1): New.
	(optimize_load_redistribution): New.
	(vect_match_slp_patterns): Use it.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325cf8406466ac25c 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
   return exact_div (common_multiple (nunits, group_size), group_size);
 }
 
+/* Helper function of optimize_load_redistribution that performs the operation
+   recursively.  */
+
+static slp_tree
+optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
+				hash_set<slp_tree> *visited, slp_tree root)
+{
+  if (visited->add (root))
+    return NULL;
+
+  slp_tree node;
+  unsigned i;
+
+  /* For now we don't know anything about externals, so do not do anything.  */
+  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
+      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)
+    return NULL;
+  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
+      && SLP_TREE_LANE_PERMUTATION (root).exists ()
+      && !SLP_TREE_SCALAR_STMTS (root).exists ())
+    {
+      /* First convert this node into a load node and add it to the leaves
+         list, flattening the permute from a lane permute into a load one.
+         If it's unneeded it will be elided later.  */
+      auto_vec<stmt_vec_info> stmts;
+      stmts.create (SLP_TREE_LANES (root));
+      load_permutation_t load_perm;
+      load_perm.create (SLP_TREE_LANES (root));
+      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
+      for (unsigned j = 0; j < lane_perm.length (); j++)
+        {
+          std::pair<unsigned, unsigned> perm = lane_perm[j];
+	  /* This isn't strictly needed, but this function is a temporary
+	     one specifically for pattern matching, so we don't want it to
+	     optimize things that the remainder of the pipeline will.  */
+	  if (perm.first != j)
+	    goto next;
+          node = SLP_TREE_CHILDREN (root)[perm.first];
+
+	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
+	    return NULL;
+
+	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]);
+        }
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "converting stmts on permute node %p\n", root);
+
+      slp_tree *value = bst_map->get (stmts);
+      if (value)
+	node = *value;
+      else
+	{
+	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+	    SLP_TREE_REF_COUNT (node)++;
+
+	  vec<stmt_vec_info> stmts_cpy = stmts.copy ();
+	  node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
+	  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
+	  SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
+	  bst_map->put (stmts_cpy, node);
+	}
+      SLP_TREE_REF_COUNT (node)++;
+
+      return node;
+    }
+
+next:
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+    {
+      slp_tree value = optimize_load_redistribution_1 (bst_map, visited, node);
+      if (value)
+	{
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+	}
+    }
+
+  return NULL;
+}
+
+/* Temporary workaround for loads not being CSEd during SLP build.  This
+   function will traverse the SLP tree rooted in ROOT and find VEC_PERM
+   nodes that blend vectors from multiple nodes that all read from the
+   same DR such that the final operation is equal to a permuted load.
+   Such nodes are then directly converted into loads themselves.  The
+   nodes are CSEd using BST_MAP.  */
+
+static void
+optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
+			      slp_tree root)
+{
+  slp_tree node;
+  unsigned i;
+  hash_set<slp_tree> visited;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+    {
+      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited, node);
+      if (value)
+	{
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+	}
+    }
+}
+
 /* Helper function of vect_match_slp_patterns.
 
    Attempts to match patterns against the slp tree rooted in REF_NODE using
@@ -2276,7 +2385,7 @@ static bool
 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 			 hash_set<slp_tree> *visited,
 			 slp_tree_to_load_perm_map_t *perm_cache,
-			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
+			 scalar_stmts_to_slp_tree_map_t *bst_map)
 {
   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
@@ -2291,6 +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 
   if (found_p)
     {
+      optimize_load_redistribution (bst_map, *ref_node);
+
       if (dump_enabled_p ())
 	{
 	  dump_printf_loc (MSG_NOTE, vect_location,


-- 

