public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Rearrange SLP nodes with duplicate statements. [PR98138]
@ 2024-06-04 13:53 Manolis Tsamis
  2024-06-05  8:07 ` Richard Biener
  0 siblings, 1 reply; 7+ messages in thread
From: Manolis Tsamis @ 2024-06-04 13:53 UTC (permalink / raw)
  To: gcc-patches
  Cc: Christoph Müllner, Richard Biener, Kewen . Lin,
	Philipp Tomsich, Tamar Christina, Jiangning Liu, Manolis Tsamis

This change adds a function that checks for SLP nodes with multiple occurrences
of the same statement (e.g. {A, B, A, B, ...}) and tries to rearrange the node
so that there are no duplicates. A vec_perm is then introduced to recreate the
original ordering. These duplicates can appear due to how two_operators nodes
are handled, and they prevent vectorization in some cases.

This targets the vectorization of the SPEC2017 x264 pixel_satd functions.
In some processors a larger than 10% improvement on x264 has been observed.

See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138

gcc/ChangeLog:

	* tree-vect-slp.cc (enum slp_oprnd_pattern): new enum for rearrangement
	patterns.
	(try_rearrange_oprnd_info): Detect if a node corresponds to one of the
	patterns.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vect-slp-two-operator.c: New test.

Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>
---

 .../aarch64/vect-slp-two-operator.c           |  42 ++++
 gcc/tree-vect-slp.cc                          | 234 ++++++++++++++++++
 2 files changed, 276 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c

diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
new file mode 100644
index 00000000000..2db066a0b6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect -fdump-tree-vect-details" } */
+
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
+    int t0 = s0 + s1;\
+    int t1 = s0 - s1;\
+    int t2 = s2 + s3;\
+    int t3 = s2 - s3;\
+    d0 = t0 + t2;\
+    d1 = t1 + t3;\
+    d2 = t0 - t2;\
+    d3 = t1 - t3;\
+}
+
+static uint32_t abs2( uint32_t a )
+{
+    uint32_t s = ((a>>15)&0x10001)*0xffff;
+    return (a+s)^s;
+}
+
+void sink(uint32_t tmp[4][4]);
+
+int x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    uint32_t tmp[4][4];
+    int sum = 0;
+    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
+    {
+        uint32_t a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+        uint32_t a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+        uint32_t a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+        uint32_t a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
+    }
+    sink(tmp);
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index bf1f467f53f..e395db0e185 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "langhooks.h"
 #include "gimple-walk.h"
+#include "gimple-pretty-print.h"
 #include "dbgcnt.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
@@ -1829,6 +1830,141 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
   SLP_TREE_CHILDREN (perm).quick_push (child2);
 }
 
+enum slp_oprnd_pattern
+{
+  SLP_OPRND_PATTERN_NONE,
+  SLP_OPRND_PATTERN_ABAB,
+  SLP_OPRND_PATTERN_AABB,
+  SLP_OPRND_PATTERN_ABBA
+};
+
+/* Check if OPRNDS_INFO has duplicated nodes that correspond to a predefined
+   pattern described by SLP_OPRND_PATTERN and return it.  */
+
+static int
+try_rearrange_oprnd_info (vec<slp_oprnd_info> &oprnds_info, unsigned group_size)
+{
+  unsigned i;
+  slp_oprnd_info info;
+
+  if (oprnds_info.length () != 2 || group_size % 4 != 0)
+    return SLP_OPRND_PATTERN_NONE;
+
+  if (!oprnds_info[0]->def_stmts[0]
+      || !is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
+    return SLP_OPRND_PATTERN_NONE;
+
+  enum tree_code code
+    = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
+  FOR_EACH_VEC_ELT (oprnds_info, i, info)
+    for (unsigned int j = 0; j < group_size; j += 1)
+      {
+	if (!info->def_stmts[j]
+	    || !is_a<gassign *> (info->def_stmts[j]->stmt)
+	    || STMT_VINFO_DATA_REF (info->def_stmts[j]))
+	  return SLP_OPRND_PATTERN_NONE;
+	/* Don't mix different operations.  */
+	if (gimple_assign_rhs_code (info->def_stmts[j]->stmt) != code)
+	  return SLP_OPRND_PATTERN_NONE;
+      }
+
+  if (gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt)
+      != gimple_assign_rhs_code (oprnds_info[1]->def_stmts[0]->stmt))
+    return SLP_OPRND_PATTERN_NONE;
+
+  int pattern = SLP_OPRND_PATTERN_NONE;
+  FOR_EACH_VEC_ELT (oprnds_info, i, info)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	int cur_pattern = SLP_OPRND_PATTERN_NONE;
+	/* Check for an ABAB... pattern.  */
+	if ((info->def_stmts[j] == info->def_stmts[j + 2])
+	    && (info->def_stmts[j + 1] == info->def_stmts[j + 3])
+	    && (info->def_stmts[j] != info->def_stmts[j + 1]))
+	  cur_pattern = SLP_OPRND_PATTERN_ABAB;
+	/* Check for an AABB... pattern.  */
+	else if ((info->def_stmts[j] == info->def_stmts[j + 1])
+		 && (info->def_stmts[j + 2] == info->def_stmts[j + 3])
+		 && (info->def_stmts[j] != info->def_stmts[j + 2]))
+	  cur_pattern = SLP_OPRND_PATTERN_AABB;
+	/* Check for an ABBA... pattern.  */
+	else if ((info->def_stmts[j] == info->def_stmts[j + 3])
+		 && (info->def_stmts[j + 1] == info->def_stmts[j + 2])
+		 && (info->def_stmts[j] != info->def_stmts[j + 1]))
+	  cur_pattern = SLP_OPRND_PATTERN_ABBA;
+	/* Unrecognised pattern.  */
+	else
+	  return SLP_OPRND_PATTERN_NONE;
+
+	if (pattern == SLP_OPRND_PATTERN_NONE)
+	  pattern = cur_pattern;
+	/* Multiple patterns detected.  */
+	else if (cur_pattern != pattern)
+	  return SLP_OPRND_PATTERN_NONE;
+      }
+
+  gcc_checking_assert (pattern != SLP_OPRND_PATTERN_NONE);
+
+  if (dump_enabled_p ())
+    {
+      if (pattern == SLP_OPRND_PATTERN_ABAB)
+	dump_printf (MSG_NOTE, "ABAB");
+      else if (pattern == SLP_OPRND_PATTERN_AABB)
+	dump_printf (MSG_NOTE, "AABB");
+      else if (pattern == SLP_OPRND_PATTERN_ABBA)
+	dump_printf (MSG_NOTE, "ABBA");
+      dump_printf (MSG_NOTE, " pattern detected.\n");
+    }
+
+  if (pattern == SLP_OPRND_PATTERN_ABAB || pattern == SLP_OPRND_PATTERN_ABBA)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	/* Given oprnd[0] -> A1, B1, A1, B1, A2, B2, A2, B2, ...
+	   Given oprnd[1] -> C1, D1, C1, D1, C2, D2, C2, D2, ...
+	   Create a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...  */
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+2] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+1];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+1];
+      }
+  else if (pattern == SLP_OPRND_PATTERN_AABB)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	/* Given oprnd[0] -> A1, A1, B1, B1, A2, A2, B2, B2, ...
+	   Given oprnd[1] -> C1, C1, D1, D1, C2, C2, D2, D2, ...
+	   Create a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ...  */
+
+	/* The ordering here is at least to some extent arbitrary.
+	   A generilized version needs to use some explicit ordering.  */
+	oprnds_info[0]->def_stmts[j+1] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+1] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[0]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+2] = oprnds_info[0]->ops[j+2];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+2];
+      }
+
+  if (dump_enabled_p ())
+    {
+      dump_printf (MSG_NOTE, "Recurse with:\n");
+      for (unsigned int j = 0; j < group_size; j++)
+	{
+	  dump_printf (MSG_NOTE, "  ");
+	  print_gimple_stmt (dump_file, oprnds_info[0]->def_stmts[j]->stmt, 0);
+	}
+    }
+
+  /* Since we've merged the two nodes in one, make the second one a copy of
+     the first.  */
+  for (unsigned int j = 0; j < group_size; j++)
+    {
+      oprnds_info[1]->def_stmts[j] = oprnds_info[0]->def_stmts[j];
+      oprnds_info[1]->ops[j] = oprnds_info[0]->ops[j];
+    }
+
+  return pattern;
+}
+
 /* Recursively build an SLP tree starting from NODE.
    Fail (and return a value not equal to zero) if def-stmts are not
    isomorphic, require data permutation or are of unsupported types of
@@ -2409,6 +2545,10 @@ out:
 
   stmt_info = stmts[0];
 
+  int rearrange_pattern = SLP_OPRND_PATTERN_NONE;
+  if (is_a<gassign *> (stmt_info->stmt))
+    rearrange_pattern = try_rearrange_oprnd_info (oprnds_info, group_size);
+
   /* Create SLP_TREE nodes for the definition node/s.  */
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
@@ -2669,6 +2809,100 @@ fail:
   *tree_size += this_tree_size + 1;
   *max_nunits = this_max_nunits;
 
+  /* If we applied any rearrangmenets then we need to reconstruct the original
+     elements with an additional permutation layer.  */
+  if (rearrange_pattern != SLP_OPRND_PATTERN_NONE)
+    {
+      slp_tree one =  new _slp_tree;
+      slp_tree two = new _slp_tree;
+      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
+      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
+      SLP_TREE_VECTYPE (one) = vectype;
+      SLP_TREE_VECTYPE (two) = vectype;
+      SLP_TREE_CHILDREN (one).safe_splice (children);
+      SLP_TREE_CHILDREN (two).safe_splice (children);
+
+      SLP_TREE_CODE (one) = VEC_PERM_EXPR;
+      SLP_TREE_CODE (two) = VEC_PERM_EXPR;
+      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+      SLP_TREE_REPRESENTATIVE (two) = stmts[2];
+      lane_permutation_t &perm_one = SLP_TREE_LANE_PERMUTATION (one);
+      lane_permutation_t &perm_two = SLP_TREE_LANE_PERMUTATION (two);
+
+      if (rearrange_pattern == SLP_OPRND_PATTERN_ABAB)
+	{
+	   /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...
+	      Create node "one" -> A1, B1, A1, B1, A2, B2, A2, B2, ...
+	      Create node "two" -> C1, D1, C1, D1, C2, D2, C2, D2, ...  */
+
+	  for (unsigned int j = 0; j < group_size; j += 4)
+	    {
+	      perm_one.safe_push (std::make_pair (0, j + 0));
+	      perm_one.safe_push (std::make_pair (0, j + 1));
+	      perm_one.safe_push (std::make_pair (0, j + 0));
+	      perm_one.safe_push (std::make_pair (0, j + 1));
+
+	      perm_two.safe_push (std::make_pair (0, j + 2));
+	      perm_two.safe_push (std::make_pair (0, j + 3));
+	      perm_two.safe_push (std::make_pair (0, j + 2));
+	      perm_two.safe_push (std::make_pair (0, j + 3));
+	    }
+	}
+      else if (rearrange_pattern == SLP_OPRND_PATTERN_AABB)
+	{
+	   /* Given a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ...
+	      Create node "one" -> A1, A1, B1, B1, A2, A2, B2, B2, ...
+	      Create node "two" -> C1, C1, D1, D1, C2, C2, D2, D2, ...  */
+
+	  for (unsigned int j = 0; j < group_size; j += 4)
+	    {
+	      perm_one.safe_push (std::make_pair (0, j + 0));
+	      perm_one.safe_push (std::make_pair (0, j + 0));
+	      perm_one.safe_push (std::make_pair (0, j + 2));
+	      perm_one.safe_push (std::make_pair (0, j + 2));
+
+	      perm_two.safe_push (std::make_pair (0, j + 1));
+	      perm_two.safe_push (std::make_pair (0, j + 1));
+	      perm_two.safe_push (std::make_pair (0, j + 3));
+	      perm_two.safe_push (std::make_pair (0, j + 3));
+	    }
+	}
+      else if (rearrange_pattern == SLP_OPRND_PATTERN_ABBA)
+	{
+	   /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...
+	      Create node "one" -> A1, B1, B1, A1, A2, B2, B2, A2, ...
+	      Create node "two" -> C1, D1, D1, C1, C2, D2, D2, C2, ...  */
+
+	  for (unsigned int j = 0; j < group_size; j += 4)
+	    {
+	      perm_one.safe_push (std::make_pair (0, j + 0));
+	      perm_one.safe_push (std::make_pair (0, j + 1));
+	      perm_one.safe_push (std::make_pair (0, j + 1));
+	      perm_one.safe_push (std::make_pair (0, j + 0));
+
+	      perm_two.safe_push (std::make_pair (0, j + 2));
+	      perm_two.safe_push (std::make_pair (0, j + 3));
+	      perm_two.safe_push (std::make_pair (0, j + 3));
+	      perm_two.safe_push (std::make_pair (0, j + 2));
+	    }
+	}
+
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
+       SLP_TREE_REF_COUNT (child)++;
+
+      node = vect_create_new_slp_node (node, stmts, 2);
+      SLP_TREE_VECTYPE (node) = vectype;
+      SLP_TREE_CHILDREN (node).quick_push (one);
+      SLP_TREE_CHILDREN (node).quick_push (two);
+
+      SLP_TREE_LANES (one) = stmts.length ();
+      SLP_TREE_LANES (two) = stmts.length ();
+
+      children.truncate (0);
+      children.safe_splice (SLP_TREE_CHILDREN (node));
+    }
+
   if (two_operators)
     {
       /* ???  We'd likely want to either cache in bst_map sth like
-- 
2.44.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-06-26 12:42 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-04 13:53 [PATCH] Rearrange SLP nodes with duplicate statements. [PR98138] Manolis Tsamis
2024-06-05  8:07 ` Richard Biener
2024-06-05  9:13   ` Tamar Christina
2024-06-05  9:18     ` Richard Biener
2024-06-10 11:43   ` Manolis Tsamis
2024-06-10 12:54     ` Richard Biener
2024-06-26 12:42   ` Manolis Tsamis

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).