public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/vendors/vrull/heads/slp-improvements)] Build SLP tree for nodes with shared statements
@ 2024-01-23 20:57 Philipp Tomsich
  0 siblings, 0 replies; 4+ messages in thread
From: Philipp Tomsich @ 2024-01-23 20:57 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:6fe9f962a2ba10a11e31f73867d6608d0736ff55

commit 6fe9f962a2ba10a11e31f73867d6608d0736ff55
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date:   Mon Oct 30 17:11:34 2023 +0100

    Build SLP tree for nodes with shared statements
    
    When an SLP tree has multiple occurances of the same statement in a
    vector node (e.g. {A, B, A, B, ...} it can be that this duplication
    hurts the efficacy of the tree building procedures, compared to what
    can be achieved if we used a ordered node with no duplicates (i.e.,
    {A, B, ...}) and then did a permute.
    
    This is especially true when the two_operators conditions are met,
    where nodes are duplicated and if they have duplicates there is an
    'exponential explosion' of nodes that end up as splat vectors.
    
    This is a first iteration that improves these cases by matching some
    patterns and deduplicates + permutes them appropriately. This
    currently targets the vectorization of the first loop in x264's
    pixel_satd functions.
    
    See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138
    
    Ref #342

Diff:
---
 gcc/tree-vect-slp.cc | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 086377a9ac0..5991f80239d 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "langhooks.h"
 #include "gimple-walk.h"
+#include "gimple-pretty-print.h"
 #include "dbgcnt.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
@@ -1819,6 +1820,109 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
   SLP_TREE_CHILDREN (perm).quick_push (child2);
 }
 
+static int
+try_rearrange_oprnd_info (vec<slp_oprnd_info> &oprnds_info, unsigned group_size)
+{
+  unsigned i;
+  slp_oprnd_info oprnd_info;
+
+  /* A more generalized version is WIP but this is generic enough anyway.  */
+  if (oprnds_info.length() != 2 || group_size < 4)
+    return 0;
+
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    for (unsigned int j = 0; j < group_size; j += 1)
+      if (!oprnd_info->def_stmts[j])
+	return 0;
+
+  int pattern = 0;
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	int cur_pattern = 0;
+	/* Check for an ABAB... pattern.  */
+	if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 2])
+	    && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 3]))
+	  cur_pattern = 1;
+	/* Check for an AABB... pattern.  */
+	else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 1])
+		 && (oprnd_info->def_stmts[j + 2] == oprnd_info->def_stmts[j + 3]))
+	  cur_pattern = 2;
+	/* Check for an ABBA... pattern.  */
+	else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 3])
+		 && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 2]))
+	  cur_pattern = 3;
+	/* Unrecognised pattern.  */
+	else
+	  return 0;
+
+	if (pattern == 0)
+	  pattern = cur_pattern;
+	/* Multiple patterns detected.  */
+	else if (cur_pattern != pattern)
+	  return 0;
+      }
+
+  gcc_checking_assert (pattern);
+
+  if (dump_enabled_p ())
+    {
+      if (pattern == 1)
+	dump_printf (MSG_NOTE, "ABAB");
+      else if (pattern == 2)
+	dump_printf (MSG_NOTE, "AABB");
+      else if (pattern == 3)
+	dump_printf (MSG_NOTE, "ABBA");
+      dump_printf (MSG_NOTE, " pattern detected.\n");
+    }
+
+  if (pattern == 1)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+2] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+1];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+1];
+      }
+  else if (pattern == 2)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	/* The ordering here is at least to some extent arbitrary.
+	   A generilized version needs to use some explicit ordering.  */
+	oprnds_info[0]->def_stmts[j+1] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+1] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[0]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+2] = oprnds_info[0]->ops[j+2];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+2];
+      }
+  else if (pattern == 3)
+    /* No need to handle that for now.  */
+    return 0;
+
+  if (dump_enabled_p ())
+    {
+      dump_printf (MSG_NOTE, "Recurse with:\n");
+      for (unsigned int j = 0; j < group_size; j++)
+	{
+	  dump_printf (MSG_NOTE, "  ");
+	  print_gimple_stmt (dump_file, oprnds_info[0]->def_stmts[j]->stmt, 0);
+	}
+    }
+
+  /* Since we've merged the two nodes in one, make the second one a copy of the first.
+     An improvement could be to truncate to one just node, but that needs refactoring
+     in vect_build_slp_tree_2 which we can avoid for now. The nodes are going to be
+     cached anyway.  */
+  for (unsigned int j = 0; j < group_size; j++)
+    {
+      oprnds_info[1]->def_stmts[j] = oprnds_info[0]->def_stmts[j];
+      oprnds_info[1]->ops[j] = oprnds_info[0]->ops[j];
+    }
+
+  return pattern;
+}
+
 /* Recursively build an SLP tree starting from NODE.
    Fail (and return a value not equal to zero) if def-stmts are not
    isomorphic, require data permutation or are of unsupported types of
@@ -2380,6 +2484,8 @@ out:
 
   stmt_info = stmts[0];
 
+  int rearrange_pattern = try_rearrange_oprnd_info (oprnds_info, group_size);
+
   /* Create SLP_TREE nodes for the definition node/s.  */
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
@@ -2640,6 +2746,83 @@ fail:
   *tree_size += this_tree_size + 1;
   *max_nunits = this_max_nunits;
 
+  /* If we applied any rearrangmenets then we need to reconstruct the original
+     elements with an additional permutation layer.  */
+  if (rearrange_pattern)
+    {
+      slp_tree one =  new _slp_tree;
+      slp_tree two = new _slp_tree;
+      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
+      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
+      SLP_TREE_VECTYPE (one) = vectype;
+      SLP_TREE_VECTYPE (two) = vectype;
+      SLP_TREE_CHILDREN (one).safe_splice (children);
+      SLP_TREE_CHILDREN (two).safe_splice (children);
+
+      if (rearrange_pattern == 1)
+	{
+	  SLP_TREE_CODE (one) = VEC_PERM_EXPR;
+	  SLP_TREE_CODE (two) = VEC_PERM_EXPR;
+	  unsigned int h = group_size / 2;
+	  SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+	  SLP_TREE_REPRESENTATIVE (two) = stmts[h];
+
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	    }
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	    }
+	}
+      else if (rearrange_pattern == 2)
+	{
+	  SLP_TREE_CODE (one) = VEC_PERM_EXPR;
+	  SLP_TREE_CODE (two) = VEC_PERM_EXPR;
+	  unsigned int h = group_size / 2;
+	  SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+	  SLP_TREE_REPRESENTATIVE (two) = stmts[h];
+
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	    }
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	    }
+	}
+
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
+       SLP_TREE_REF_COUNT (child)++;
+
+      node = vect_create_new_slp_node (node, stmts, 2);
+      SLP_TREE_VECTYPE (node) = vectype;
+      SLP_TREE_CHILDREN (node).quick_push (one);
+      SLP_TREE_CHILDREN (node).quick_push (two);
+
+      SLP_TREE_LANES (one) = stmts.length ();
+      SLP_TREE_LANES (two) = stmts.length ();
+
+      children.truncate(0);
+      children.safe_splice(SLP_TREE_CHILDREN (node));
+    }
+
+
   if (two_operators)
     {
       /* ???  We'd likely want to either cache in bst_map sth like

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/vendors/vrull/heads/slp-improvements)] Build SLP tree for nodes with shared statements
@ 2024-02-27 13:37 Philipp Tomsich
  0 siblings, 0 replies; 4+ messages in thread
From: Philipp Tomsich @ 2024-02-27 13:37 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:821917bf62b82ebcd0f423c21b06def7735e1371

commit 821917bf62b82ebcd0f423c21b06def7735e1371
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date:   Mon Oct 30 17:11:34 2023 +0100

    Build SLP tree for nodes with shared statements
    
    When an SLP tree has multiple occurances of the same statement in a
    vector node (e.g. {A, B, A, B, ...} it can be that this duplication
    hurts the efficacy of the tree building procedures, compared to what
    can be achieved if we used a ordered node with no duplicates (i.e.,
    {A, B, ...}) and then did a permute.
    
    This is especially true when the two_operators conditions are met,
    where nodes are duplicated and if they have duplicates there is an
    'exponential explosion' of nodes that end up as splat vectors.
    
    We check that the vector elements taking part in a rearranged pattern
    (e.g. ABAB) are actually different. Otherwise a splat vector (AAAA)
    would be considered to fulfill any rearrangement pattern. We don't
    want to merge splat vectors together in a mixed node as that can both
    reduce performance and cause codegen issues.
    
    This commit improves these cases by matching the patterns and
    deduplicates + permutes them appropriately. This change (among other
    cases) targets the vectorization of the first loop in x264's
    pixel_satd functions.
    
    See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138
    
    Ref #342

Diff:
---
 gcc/testsuite/gcc.target/aarch64/pr98138_1.c |  32 ++++
 gcc/testsuite/gcc.target/aarch64/pr98138_2.c |  48 ++++++
 gcc/tree-vect-slp.cc                         | 230 +++++++++++++++++++++++++++
 3 files changed, 310 insertions(+)

diff --git a/gcc/testsuite/gcc.target/aarch64/pr98138_1.c b/gcc/testsuite/gcc.target/aarch64/pr98138_1.c
new file mode 100644
index 00000000000..a9f7d4d6a99
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr98138_1.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
+
+extern void test(unsigned int t[4][4]);
+
+void foo(unsigned char *p1, int i1, unsigned char *p2, int i2)
+{
+  unsigned int tmp[4][4];
+  unsigned int a0, a1, a2, a3;
+
+  for (int i = 0; i < 4; i++, p1 += i1, p2 += i2)
+    {
+      a0 = (p1[0] - p2[0]) + ((p1[4] - p2[4]) << 16);
+      a1 = (p1[1] - p2[1]) + ((p1[5] - p2[5]) << 16);
+      a2 = (p1[2] - p2[2]) + ((p1[6] - p2[6]) << 16);
+      a3 = (p1[3] - p2[3]) + ((p1[7] - p2[7]) << 16);
+
+      int t0 = a0 + a1;
+      int t1 = a0 - a1;
+      int t2 = a2 + a3;
+      int t3 = a2 - a3;
+
+      tmp[i][0] = t0 + t2;
+      tmp[i][2] = t0 - t2;
+      tmp[i][1] = t1 + t3;
+      tmp[i][3] = t1 - t3;
+    }
+
+  test(tmp);
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr98138_2.c b/gcc/testsuite/gcc.target/aarch64/pr98138_2.c
new file mode 100644
index 00000000000..7e85470a677
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr98138_2.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
+
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+typedef unsigned short uint16_t;
+
+static inline
+uint32_t abs2 ( uint32_t a )
+{
+    uint32_t s = ((a>>15)&0x10001)*0xffff;
+    return (a+s)^s;
+}
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
+    int t0 = s0 + s1;\
+    int t1 = s0 - s1;\
+    int t2 = s2 + s3;\
+    int t3 = s2 - s3;\
+    d0 = t0 + t2;\
+    d2 = t0 - t2;\
+    d1 = t1 + t3;\
+    d3 = t1 - t3;\
+}
+
+int
+foo ( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    uint32_t tmp[4][4];
+    uint32_t a0, a1, a2, a3;
+    int sum = 0;
+    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
+    {
+        a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+        a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+        a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+        a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
+    }
+    for( int i = 0; i < 4; i++ )
+    {
+        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
+        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
+    }
+    return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1;
+}
+
+/* { dg-final { scan-tree-dump "vectorized 2 loops" "vect" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 895f4f7fb6b..238a17ca4e1 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "langhooks.h"
 #include "gimple-walk.h"
+#include "gimple-pretty-print.h"
 #include "dbgcnt.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
@@ -1819,6 +1820,141 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
   SLP_TREE_CHILDREN (perm).quick_push (child2);
 }
 
+enum slp_oprnd_pattern
+{
+  SLP_OPRND_PATTERN_NONE,
+  SLP_OPRND_PATTERN_ABAB,
+  SLP_OPRND_PATTERN_AABB,
+  SLP_OPRND_PATTERN_ABBA
+};
+
+static int
+try_rearrange_oprnd_info (vec<slp_oprnd_info> &oprnds_info, unsigned group_size)
+{
+  unsigned i;
+  slp_oprnd_info oprnd_info;
+
+  /* A more generalized version is WIP but this is generic enough anyway.  */
+  if (oprnds_info.length() != 2 || group_size % 4 != 0)
+    return SLP_OPRND_PATTERN_NONE;
+
+  if (!oprnds_info[0]->def_stmts[0]
+      || !is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
+    return SLP_OPRND_PATTERN_NONE;
+
+  enum tree_code code
+    = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    for (unsigned int j = 0; j < group_size; j += 1)
+      {
+	if (!oprnd_info->def_stmts[j]
+	    || !is_a<gassign *> (oprnd_info->def_stmts[j]->stmt)
+	    || STMT_VINFO_DATA_REF (oprnd_info->def_stmts[j]))
+	  return SLP_OPRND_PATTERN_NONE;
+	/* Don't mix different operations.  */
+	if (gimple_assign_rhs_code (oprnd_info->def_stmts[j]->stmt) != code)
+	  return SLP_OPRND_PATTERN_NONE;
+      }
+
+  if (gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt)
+      != gimple_assign_rhs_code (oprnds_info[1]->def_stmts[0]->stmt))
+    return SLP_OPRND_PATTERN_NONE;
+
+  int pattern = SLP_OPRND_PATTERN_NONE;
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	int cur_pattern = SLP_OPRND_PATTERN_NONE;
+	/* Check for an ABAB... pattern.  */
+	if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 2])
+	    && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 3])
+	    && (oprnd_info->def_stmts[j] != oprnd_info->def_stmts[j + 1]))
+	  cur_pattern = SLP_OPRND_PATTERN_ABAB;
+	/* Check for an AABB... pattern.  */
+	else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 1])
+		 && (oprnd_info->def_stmts[j + 2] == oprnd_info->def_stmts[j + 3])
+		 && (oprnd_info->def_stmts[j] != oprnd_info->def_stmts[j + 2]))
+	  cur_pattern = SLP_OPRND_PATTERN_AABB;
+	/* Check for an ABBA... pattern.  */
+	else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 3])
+		 && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 2])
+		 && (oprnd_info->def_stmts[j] != oprnd_info->def_stmts[j + 1]))
+	  cur_pattern = SLP_OPRND_PATTERN_ABBA;
+	/* Unrecognised pattern.  */
+	else
+	  return SLP_OPRND_PATTERN_NONE;
+
+	if (pattern == SLP_OPRND_PATTERN_NONE)
+	  pattern = cur_pattern;
+	/* Multiple patterns detected.  */
+	else if (cur_pattern != pattern)
+	  return SLP_OPRND_PATTERN_NONE;
+      }
+
+  gcc_checking_assert (pattern != SLP_OPRND_PATTERN_NONE);
+
+  if (dump_enabled_p ())
+    {
+      if (pattern == SLP_OPRND_PATTERN_ABAB)
+	dump_printf (MSG_NOTE, "ABAB");
+      else if (pattern == SLP_OPRND_PATTERN_AABB)
+	dump_printf (MSG_NOTE, "AABB");
+      else if (pattern == SLP_OPRND_PATTERN_ABBA)
+	dump_printf (MSG_NOTE, "ABBA");
+      dump_printf (MSG_NOTE, " pattern detected.\n");
+    }
+
+  if (pattern == SLP_OPRND_PATTERN_ABAB || pattern == SLP_OPRND_PATTERN_ABBA)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	/* Given oprnd[0] -> A1, B1, A1, B1, A2, B2, A2, B2, ...
+	   Given oprnd[1] -> C1, D1, C1, D1, C2, D2, C2, D2, ...
+	   Create a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...  */
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+2] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+1];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+1];
+      }
+  else if (pattern == SLP_OPRND_PATTERN_AABB)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	/* Given oprnd[0] -> A1, A1, B1, B1, A2, A2, B2, B2, ...
+	   Given oprnd[1] -> C1, C1, D1, D1, C2, C2, D2, D2, ...
+	   Create a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ...  */
+
+	/* The ordering here is at least to some extent arbitrary.
+	   A generilized version needs to use some explicit ordering.  */
+	oprnds_info[0]->def_stmts[j+1] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+1] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[0]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+2] = oprnds_info[0]->ops[j+2];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+2];
+      }
+
+  if (dump_enabled_p ())
+    {
+      dump_printf (MSG_NOTE, "Recurse with:\n");
+      for (unsigned int j = 0; j < group_size; j++)
+	{
+	  dump_printf (MSG_NOTE, "  ");
+	  print_gimple_stmt (dump_file, oprnds_info[0]->def_stmts[j]->stmt, 0);
+	}
+    }
+
+  /* Since we've merged the two nodes in one, make the second one a copy of the first.
+     An improvement could be to truncate to one just node, but that needs refactoring
+     in vect_build_slp_tree_2 which we can avoid for now. The nodes are going to be
+     cached anyway.  */
+  for (unsigned int j = 0; j < group_size; j++)
+    {
+      oprnds_info[1]->def_stmts[j] = oprnds_info[0]->def_stmts[j];
+      oprnds_info[1]->ops[j] = oprnds_info[0]->ops[j];
+    }
+
+  return pattern;
+}
+
 /* Recursively build an SLP tree starting from NODE.
    Fail (and return a value not equal to zero) if def-stmts are not
    isomorphic, require data permutation or are of unsupported types of
@@ -2380,6 +2516,8 @@ out:
 
   stmt_info = stmts[0];
 
+  int rearrange_pattern = try_rearrange_oprnd_info (oprnds_info, group_size);
+
   /* Create SLP_TREE nodes for the definition node/s.  */
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
@@ -2640,6 +2778,98 @@ fail:
   *tree_size += this_tree_size + 1;
   *max_nunits = this_max_nunits;
 
+  /* If we applied any rearrangmenets then we need to reconstruct the original
+     elements with an additional permutation layer.  */
+  if (rearrange_pattern != SLP_OPRND_PATTERN_NONE)
+    {
+      slp_tree one =  new _slp_tree;
+      slp_tree two = new _slp_tree;
+      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
+      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
+      SLP_TREE_VECTYPE (one) = vectype;
+      SLP_TREE_VECTYPE (two) = vectype;
+      SLP_TREE_CHILDREN (one).safe_splice (children);
+      SLP_TREE_CHILDREN (two).safe_splice (children);
+
+      SLP_TREE_CODE (one) = VEC_PERM_EXPR;
+      SLP_TREE_CODE (two) = VEC_PERM_EXPR;
+      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+      SLP_TREE_REPRESENTATIVE (two) = stmts[2];
+
+      if (rearrange_pattern == SLP_OPRND_PATTERN_ABAB)
+	{
+	   /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...
+	      Create node "one" -> A1, B1, A1, B1, A2, B2, A2, B2, ...
+	      Create node "two" -> C1, D1, C1, D1, C2, D2, C2, D2, ...  */
+
+	  for (unsigned int j = 0; j < group_size; j += 4)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	    }
+	}
+      else if (rearrange_pattern == SLP_OPRND_PATTERN_AABB)
+	{
+	   /* Given a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ...
+	      Create node "one" -> A1, A1, B1, B1, A2, A2, B2, B2, ...
+	      Create node "two" -> C1, C1, D1, D1, C2, C2, D2, D2, ...  */
+
+	  for (unsigned int j = 0; j < group_size; j += 4)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 2));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 2));
+
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	    }
+	}
+      else if (rearrange_pattern == SLP_OPRND_PATTERN_ABBA)
+	{
+	   /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...
+	      Create node "one" -> A1, B1, B1, A1, A2, B2, B2, A2, ...
+	      Create node "two" -> C1, D1, D1, C1, C2, D2, D2, C2, ...  */
+
+	  for (unsigned int j = 0; j < group_size; j += 4)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2));
+	    }
+	}
+
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
+       SLP_TREE_REF_COUNT (child)++;
+
+      node = vect_create_new_slp_node (node, stmts, 2);
+      SLP_TREE_VECTYPE (node) = vectype;
+      SLP_TREE_CHILDREN (node).quick_push (one);
+      SLP_TREE_CHILDREN (node).quick_push (two);
+
+      SLP_TREE_LANES (one) = stmts.length ();
+      SLP_TREE_LANES (two) = stmts.length ();
+
+      children.truncate(0);
+      children.safe_splice(SLP_TREE_CHILDREN (node));
+    }
+
   if (two_operators)
     {
       /* ???  We'd likely want to either cache in bst_map sth like

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/vendors/vrull/heads/slp-improvements)] Build SLP tree for nodes with shared statements
@ 2024-01-17 19:13 Philipp Tomsich
  0 siblings, 0 replies; 4+ messages in thread
From: Philipp Tomsich @ 2024-01-17 19:13 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:88e655d3c454acca54c139df1a52a0eedab78640

commit 88e655d3c454acca54c139df1a52a0eedab78640
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date:   Mon Oct 30 17:11:34 2023 +0100

    Build SLP tree for nodes with shared statements
    
    When an SLP tree has multiple occurances of the same statement in a
    vector node (e.g. {A, B, A, B, ...} it can be that this duplication
    hurts the efficacy of the tree building procedures, compared to what
    can be achieved if we used a ordered node with no duplicates (i.e.,
    {A, B, ...}) and then did a permute.
    
    This is especially true when the two_operators conditions are met,
    where nodes are duplicated and if they have duplicates there is an
    'exponential explosion' of nodes that end up as splat vectors.
    
    This is a first iteration that improves these cases by matching some
    patterns and deduplicates + permutes them appropriately. This
    currently targets the vectorization of the first loop in x264's
    pixel_satd functions.
    
    See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138
    
    Ref #342

Diff:
---
 gcc/tree-vect-slp.cc | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index b6cce55ce90..4c56e8d1395 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "langhooks.h"
 #include "gimple-walk.h"
+#include "gimple-pretty-print.h"
 #include "dbgcnt.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
@@ -1819,6 +1820,109 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
   SLP_TREE_CHILDREN (perm).quick_push (child2);
 }
 
+static int
+try_rearrange_oprnd_info (vec<slp_oprnd_info> &oprnds_info, unsigned group_size)
+{
+  unsigned i;
+  slp_oprnd_info oprnd_info;
+
+  /* A more generalized version is WIP but this is generic enough anyway.  */
+  if (oprnds_info.length() != 2 || group_size < 4)
+    return 0;
+
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    for (unsigned int j = 0; j < group_size; j += 1)
+      if (!oprnd_info->def_stmts[j])
+	return 0;
+
+  int pattern = 0;
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	int cur_pattern = 0;
+	/* Check for an ABAB... pattern.  */
+	if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 2])
+	    && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 3]))
+	  cur_pattern = 1;
+	/* Check for an AABB... pattern.  */
+	else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 1])
+		 && (oprnd_info->def_stmts[j + 2] == oprnd_info->def_stmts[j + 3]))
+	  cur_pattern = 2;
+	/* Check for an ABBA... pattern.  */
+	else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 3])
+		 && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 2]))
+	  cur_pattern = 3;
+	/* Unrecognised pattern.  */
+	else
+	  return 0;
+
+	if (pattern == 0)
+	  pattern = cur_pattern;
+	/* Multiple patterns detected.  */
+	else if (cur_pattern != pattern)
+	  return 0;
+      }
+
+  gcc_checking_assert (pattern);
+
+  if (dump_enabled_p ())
+    {
+      if (pattern == 1)
+	dump_printf (MSG_NOTE, "ABAB");
+      else if (pattern == 2)
+	dump_printf (MSG_NOTE, "AABB");
+      else if (pattern == 3)
+	dump_printf (MSG_NOTE, "ABBA");
+      dump_printf (MSG_NOTE, " pattern detected.\n");
+    }
+
+  if (pattern == 1)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+2] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+1];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+1];
+      }
+  else if (pattern == 2)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	/* The ordering here is at least to some extent arbitrary.
+	   A generilized version needs to use some explicit ordering.  */
+	oprnds_info[0]->def_stmts[j+1] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+1] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[0]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+2] = oprnds_info[0]->ops[j+2];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+2];
+      }
+  else if (pattern == 3)
+    /* No need to handle that for now.  */
+    return 0;
+
+  if (dump_enabled_p ())
+    {
+      dump_printf (MSG_NOTE, "Recurse with:\n");
+      for (unsigned int j = 0; j < group_size; j++)
+	{
+	  dump_printf (MSG_NOTE, "  ");
+	  print_gimple_stmt (dump_file, oprnds_info[0]->def_stmts[j]->stmt, 0);
+	}
+    }
+
+  /* Since we've merged the two nodes in one, make the second one a copy of the first.
+     An improvement could be to truncate to one just node, but that needs refactoring
+     in vect_build_slp_tree_2 which we can avoid for now. The nodes are going to be
+     cached anyway.  */
+  for (unsigned int j = 0; j < group_size; j++)
+    {
+      oprnds_info[1]->def_stmts[j] = oprnds_info[0]->def_stmts[j];
+      oprnds_info[1]->ops[j] = oprnds_info[0]->ops[j];
+    }
+
+  return pattern;
+}
+
 /* Recursively build an SLP tree starting from NODE.
    Fail (and return a value not equal to zero) if def-stmts are not
    isomorphic, require data permutation or are of unsupported types of
@@ -2380,6 +2484,8 @@ out:
 
   stmt_info = stmts[0];
 
+  int rearrange_pattern = try_rearrange_oprnd_info (oprnds_info, group_size);
+
   /* Create SLP_TREE nodes for the definition node/s.  */
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
@@ -2640,6 +2746,83 @@ fail:
   *tree_size += this_tree_size + 1;
   *max_nunits = this_max_nunits;
 
+  /* If we applied any rearrangmenets then we need to reconstruct the original
+     elements with an additional permutation layer.  */
+  if (rearrange_pattern)
+    {
+      slp_tree one =  new _slp_tree;
+      slp_tree two = new _slp_tree;
+      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
+      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
+      SLP_TREE_VECTYPE (one) = vectype;
+      SLP_TREE_VECTYPE (two) = vectype;
+      SLP_TREE_CHILDREN (one).safe_splice (children);
+      SLP_TREE_CHILDREN (two).safe_splice (children);
+
+      if (rearrange_pattern == 1)
+	{
+	  SLP_TREE_CODE (one) = VEC_PERM_EXPR;
+	  SLP_TREE_CODE (two) = VEC_PERM_EXPR;
+	  unsigned int h = group_size / 2;
+	  SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+	  SLP_TREE_REPRESENTATIVE (two) = stmts[h];
+
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	    }
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	    }
+	}
+      else if (rearrange_pattern == 2)
+	{
+	  SLP_TREE_CODE (one) = VEC_PERM_EXPR;
+	  SLP_TREE_CODE (two) = VEC_PERM_EXPR;
+	  unsigned int h = group_size / 2;
+	  SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+	  SLP_TREE_REPRESENTATIVE (two) = stmts[h];
+
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	    }
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	    }
+	}
+
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
+       SLP_TREE_REF_COUNT (child)++;
+
+      node = vect_create_new_slp_node (node, stmts, 2);
+      SLP_TREE_VECTYPE (node) = vectype;
+      SLP_TREE_CHILDREN (node).quick_push (one);
+      SLP_TREE_CHILDREN (node).quick_push (two);
+
+      SLP_TREE_LANES (one) = stmts.length ();
+      SLP_TREE_LANES (two) = stmts.length ();
+
+      children.truncate(0);
+      children.safe_splice(SLP_TREE_CHILDREN (node));
+    }
+
+
   if (two_operators)
     {
       /* ???  We'd likely want to either cache in bst_map sth like

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/vendors/vrull/heads/slp-improvements)] Build SLP tree for nodes with shared statements
@ 2023-11-28 13:35 Philipp Tomsich
  0 siblings, 0 replies; 4+ messages in thread
From: Philipp Tomsich @ 2023-11-28 13:35 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:c8eb943687266e13cdd95286518cdeed2508237a

commit c8eb943687266e13cdd95286518cdeed2508237a
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date:   Mon Oct 30 17:11:34 2023 +0100

    Build SLP tree for nodes with shared statements
    
    When an SLP tree has multiple occurances of the same statement in a
    vector node (e.g. {A, B, A, B, ...} it can be that this duplication
    hurts the efficacy of the tree building procedures, compared to what
    can be achieved if we used a ordered node with no duplicates (i.e.,
    {A, B, ...}) and then did a permute.
    
    This is especially true when the two_operators conditions are met,
    where nodes are duplicated and if they have duplicates there is an
    'exponential explosion' of nodes that end up as splat vectors.
    
    This is a first iteration that improves these cases by matching some
    patterns and deduplicates + permutes them appropriately. This
    currently targets the vectorization of the first loop in x264's
    pixel_satd functions.
    
    See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138
    
    Ref #342

Diff:
---
 gcc/tree-vect-slp.cc | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 4a09b3c2aca..681f72c43fe 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "langhooks.h"
 #include "gimple-walk.h"
+#include "gimple-pretty-print.h"
 #include "dbgcnt.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
@@ -1831,6 +1832,109 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
   SLP_TREE_CHILDREN (perm).quick_push (child2);
 }
 
+static int
+try_rearrange_oprnd_info (vec<slp_oprnd_info> &oprnds_info, unsigned group_size)
+{
+  unsigned i;
+  slp_oprnd_info oprnd_info;
+
+  /* A more generalized version is WIP but this is generic enough anyway.  */
+  if (oprnds_info.length() != 2 || group_size < 4)
+    return 0;
+
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    for (unsigned int j = 0; j < group_size; j += 1)
+      if (!oprnd_info->def_stmts[j])
+	return 0;
+
+  int pattern = 0;
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	int cur_pattern = 0;
+	/* Check for an ABAB... pattern.  */
+	if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 2])
+	    && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 3]))
+	  cur_pattern = 1;
+	/* Check for an AABB... pattern.  */
+	else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 1])
+		 && (oprnd_info->def_stmts[j + 2] == oprnd_info->def_stmts[j + 3]))
+	  cur_pattern = 2;
+	/* Check for an ABBA... pattern.  */
+	else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 3])
+		 && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 2]))
+	  cur_pattern = 3;
+	/* Unrecognised pattern.  */
+	else
+	  return 0;
+
+	if (pattern == 0)
+	  pattern = cur_pattern;
+	/* Multiple patterns detected.  */
+	else if (cur_pattern != pattern)
+	  return 0;
+      }
+
+  gcc_checking_assert (pattern);
+
+  if (dump_enabled_p ())
+    {
+      if (pattern == 1)
+	dump_printf (MSG_NOTE, "ABAB");
+      else if (pattern == 2)
+	dump_printf (MSG_NOTE, "AABB");
+      else if (pattern == 3)
+	dump_printf (MSG_NOTE, "ABBA");
+      dump_printf (MSG_NOTE, " pattern detected.\n");
+    }
+
+  if (pattern == 1)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+2] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+1];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+1];
+      }
+  else if (pattern == 2)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	/* The ordering here is at least to some extent arbitrary.
+	   A generilized version needs to use some explicit ordering.  */
+	oprnds_info[0]->def_stmts[j+1] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+1] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[0]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+2] = oprnds_info[0]->ops[j+2];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+2];
+      }
+  else if (pattern == 3)
+    /* No need to handle that for now.  */
+    return 0;
+
+  if (dump_enabled_p ())
+    {
+      dump_printf (MSG_NOTE, "Recurse with:\n");
+      for (unsigned int j = 0; j < group_size; j++)
+	{
+	  dump_printf (MSG_NOTE, "  ");
+	  print_gimple_stmt (dump_file, oprnds_info[0]->def_stmts[j]->stmt, 0);
+	}
+    }
+
+  /* Since we've merged the two nodes in one, make the second one a copy of the first.
+     An improvement could be to truncate to one just node, but that needs refactoring
+     in vect_build_slp_tree_2 which we can avoid for now. The nodes are going to be
+     cached anyway.  */
+  for (unsigned int j = 0; j < group_size; j++)
+    {
+      oprnds_info[1]->def_stmts[j] = oprnds_info[0]->def_stmts[j];
+      oprnds_info[1]->ops[j] = oprnds_info[0]->ops[j];
+    }
+
+  return pattern;
+}
+
 /* Recursively build an SLP tree starting from NODE.
    Fail (and return a value not equal to zero) if def-stmts are not
    isomorphic, require data permutation or are of unsupported types of
@@ -2392,6 +2496,8 @@ out:
 
   stmt_info = stmts[0];
 
+  int rearrange_pattern = try_rearrange_oprnd_info (oprnds_info, group_size);
+
   /* Create SLP_TREE nodes for the definition node/s.  */
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
@@ -2629,6 +2735,83 @@ fail:
   *tree_size += this_tree_size + 1;
   *max_nunits = this_max_nunits;
 
+  /* If we applied any rearrangmenets then we need to reconstruct the original
+     elements with an additional permutation layer.  */
+  if (rearrange_pattern)
+    {
+      slp_tree one =  new _slp_tree;
+      slp_tree two = new _slp_tree;
+      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
+      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
+      SLP_TREE_VECTYPE (one) = vectype;
+      SLP_TREE_VECTYPE (two) = vectype;
+      SLP_TREE_CHILDREN (one).safe_splice (children);
+      SLP_TREE_CHILDREN (two).safe_splice (children);
+
+      if (rearrange_pattern == 1)
+	{
+	  SLP_TREE_CODE (one) = VEC_PERM_EXPR;
+	  SLP_TREE_CODE (two) = VEC_PERM_EXPR;
+	  unsigned int h = group_size / 2;
+	  SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+	  SLP_TREE_REPRESENTATIVE (two) = stmts[h];
+
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	    }
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	    }
+	}
+      else if (rearrange_pattern == 2)
+	{
+	  SLP_TREE_CODE (one) = VEC_PERM_EXPR;
+	  SLP_TREE_CODE (two) = VEC_PERM_EXPR;
+	  unsigned int h = group_size / 2;
+	  SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+	  SLP_TREE_REPRESENTATIVE (two) = stmts[h];
+
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	    }
+	  for (unsigned int j = 0; j < h; j += 2)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1));
+	    }
+	}
+
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
+       SLP_TREE_REF_COUNT (child)++;
+
+      node = vect_create_new_slp_node (node, stmts, 2);
+      SLP_TREE_VECTYPE (node) = vectype;
+      SLP_TREE_CHILDREN (node).quick_push (one);
+      SLP_TREE_CHILDREN (node).quick_push (two);
+
+      SLP_TREE_LANES (one) = stmts.length ();
+      SLP_TREE_LANES (two) = stmts.length ();
+
+      children.truncate(0);
+      children.safe_splice(SLP_TREE_CHILDREN (node));
+    }
+
+
   if (two_operators)
     {
       /* ???  We'd likely want to either cache in bst_map sth like

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-02-27 13:37 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-23 20:57 [gcc(refs/vendors/vrull/heads/slp-improvements)] Build SLP tree for nodes with shared statements Philipp Tomsich
  -- strict thread matches above, loose matches on Subject: below --
2024-02-27 13:37 Philipp Tomsich
2024-01-17 19:13 Philipp Tomsich
2023-11-28 13:35 Philipp Tomsich

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).