public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
From: Philipp Tomsich <ptomsich@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/vendors/vrull/heads/slp-improvements)] Build SLP tree for nodes with shared statements
Date: Tue, 27 Feb 2024 13:37:13 +0000 (GMT)	[thread overview]
Message-ID: <20240227133713.241BD3858427@sourceware.org> (raw)

https://gcc.gnu.org/g:821917bf62b82ebcd0f423c21b06def7735e1371

commit 821917bf62b82ebcd0f423c21b06def7735e1371
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date:   Mon Oct 30 17:11:34 2023 +0100

    Build SLP tree for nodes with shared statements
    
    When an SLP tree has multiple occurances of the same statement in a
    vector node (e.g. {A, B, A, B, ...} it can be that this duplication
    hurts the efficacy of the tree building procedures, compared to what
    can be achieved if we used a ordered node with no duplicates (i.e.,
    {A, B, ...}) and then did a permute.
    
    This is especially true when the two_operators conditions are met,
    where nodes are duplicated and if they have duplicates there is an
    'exponential explosion' of nodes that end up as splat vectors.
    
    We check that the vector elements taking part in a rearranged pattern
    (e.g. ABAB) are actually different. Otherwise a splat vector (AAAA)
    would be considered to fulfill any rearrangement pattern. We don't
    want to merge splat vectors together in a mixed node as that can both
    reduce performance and cause codegen issues.
    
    This commit improves these cases by matching the patterns and
    deduplicates + permutes them appropriately. This change (among other
    cases) targets the vectorization of the first loop in x264's
    pixel_satd functions.
    
    See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138
    
    Ref #342

Diff:
---
 gcc/testsuite/gcc.target/aarch64/pr98138_1.c |  32 ++++
 gcc/testsuite/gcc.target/aarch64/pr98138_2.c |  48 ++++++
 gcc/tree-vect-slp.cc                         | 230 +++++++++++++++++++++++++++
 3 files changed, 310 insertions(+)

diff --git a/gcc/testsuite/gcc.target/aarch64/pr98138_1.c b/gcc/testsuite/gcc.target/aarch64/pr98138_1.c
new file mode 100644
index 00000000000..a9f7d4d6a99
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr98138_1.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
+
+extern void test(unsigned int t[4][4]);
+
+void foo(unsigned char *p1, int i1, unsigned char *p2, int i2)
+{
+  unsigned int tmp[4][4];
+  unsigned int a0, a1, a2, a3;
+
+  for (int i = 0; i < 4; i++, p1 += i1, p2 += i2)
+    {
+      a0 = (p1[0] - p2[0]) + ((p1[4] - p2[4]) << 16);
+      a1 = (p1[1] - p2[1]) + ((p1[5] - p2[5]) << 16);
+      a2 = (p1[2] - p2[2]) + ((p1[6] - p2[6]) << 16);
+      a3 = (p1[3] - p2[3]) + ((p1[7] - p2[7]) << 16);
+
+      int t0 = a0 + a1;
+      int t1 = a0 - a1;
+      int t2 = a2 + a3;
+      int t3 = a2 - a3;
+
+      tmp[i][0] = t0 + t2;
+      tmp[i][2] = t0 - t2;
+      tmp[i][1] = t1 + t3;
+      tmp[i][3] = t1 - t3;
+    }
+
+  test(tmp);
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr98138_2.c b/gcc/testsuite/gcc.target/aarch64/pr98138_2.c
new file mode 100644
index 00000000000..7e85470a677
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr98138_2.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */
+
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+typedef unsigned short uint16_t;
+
+static inline
+uint32_t abs2 ( uint32_t a )
+{
+    uint32_t s = ((a>>15)&0x10001)*0xffff;
+    return (a+s)^s;
+}
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
+    int t0 = s0 + s1;\
+    int t1 = s0 - s1;\
+    int t2 = s2 + s3;\
+    int t3 = s2 - s3;\
+    d0 = t0 + t2;\
+    d2 = t0 - t2;\
+    d1 = t1 + t3;\
+    d3 = t1 - t3;\
+}
+
+int
+foo ( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    uint32_t tmp[4][4];
+    uint32_t a0, a1, a2, a3;
+    int sum = 0;
+    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
+    {
+        a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+        a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+        a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+        a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
+    }
+    for( int i = 0; i < 4; i++ )
+    {
+        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
+        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
+    }
+    return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1;
+}
+
+/* { dg-final { scan-tree-dump "vectorized 2 loops" "vect" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 895f4f7fb6b..238a17ca4e1 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "langhooks.h"
 #include "gimple-walk.h"
+#include "gimple-pretty-print.h"
 #include "dbgcnt.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
@@ -1819,6 +1820,141 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
   SLP_TREE_CHILDREN (perm).quick_push (child2);
 }
 
+enum slp_oprnd_pattern
+{
+  SLP_OPRND_PATTERN_NONE,
+  SLP_OPRND_PATTERN_ABAB,
+  SLP_OPRND_PATTERN_AABB,
+  SLP_OPRND_PATTERN_ABBA
+};
+
+static int
+try_rearrange_oprnd_info (vec<slp_oprnd_info> &oprnds_info, unsigned group_size)
+{
+  unsigned i;
+  slp_oprnd_info oprnd_info;
+
+  /* A more generalized version is WIP but this is generic enough anyway.  */
+  if (oprnds_info.length() != 2 || group_size % 4 != 0)
+    return SLP_OPRND_PATTERN_NONE;
+
+  if (!oprnds_info[0]->def_stmts[0]
+      || !is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
+    return SLP_OPRND_PATTERN_NONE;
+
+  enum tree_code code
+    = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    for (unsigned int j = 0; j < group_size; j += 1)
+      {
+	if (!oprnd_info->def_stmts[j]
+	    || !is_a<gassign *> (oprnd_info->def_stmts[j]->stmt)
+	    || STMT_VINFO_DATA_REF (oprnd_info->def_stmts[j]))
+	  return SLP_OPRND_PATTERN_NONE;
+	/* Don't mix different operations.  */
+	if (gimple_assign_rhs_code (oprnd_info->def_stmts[j]->stmt) != code)
+	  return SLP_OPRND_PATTERN_NONE;
+      }
+
+  if (gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt)
+      != gimple_assign_rhs_code (oprnds_info[1]->def_stmts[0]->stmt))
+    return SLP_OPRND_PATTERN_NONE;
+
+  int pattern = SLP_OPRND_PATTERN_NONE;
+  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	int cur_pattern = SLP_OPRND_PATTERN_NONE;
+	/* Check for an ABAB... pattern.  */
+	if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 2])
+	    && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 3])
+	    && (oprnd_info->def_stmts[j] != oprnd_info->def_stmts[j + 1]))
+	  cur_pattern = SLP_OPRND_PATTERN_ABAB;
+	/* Check for an AABB... pattern.  */
+	else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 1])
+		 && (oprnd_info->def_stmts[j + 2] == oprnd_info->def_stmts[j + 3])
+		 && (oprnd_info->def_stmts[j] != oprnd_info->def_stmts[j + 2]))
+	  cur_pattern = SLP_OPRND_PATTERN_AABB;
+	/* Check for an ABBA... pattern.  */
+	else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 3])
+		 && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 2])
+		 && (oprnd_info->def_stmts[j] != oprnd_info->def_stmts[j + 1]))
+	  cur_pattern = SLP_OPRND_PATTERN_ABBA;
+	/* Unrecognised pattern.  */
+	else
+	  return SLP_OPRND_PATTERN_NONE;
+
+	if (pattern == SLP_OPRND_PATTERN_NONE)
+	  pattern = cur_pattern;
+	/* Multiple patterns detected.  */
+	else if (cur_pattern != pattern)
+	  return SLP_OPRND_PATTERN_NONE;
+      }
+
+  gcc_checking_assert (pattern != SLP_OPRND_PATTERN_NONE);
+
+  if (dump_enabled_p ())
+    {
+      if (pattern == SLP_OPRND_PATTERN_ABAB)
+	dump_printf (MSG_NOTE, "ABAB");
+      else if (pattern == SLP_OPRND_PATTERN_AABB)
+	dump_printf (MSG_NOTE, "AABB");
+      else if (pattern == SLP_OPRND_PATTERN_ABBA)
+	dump_printf (MSG_NOTE, "ABBA");
+      dump_printf (MSG_NOTE, " pattern detected.\n");
+    }
+
+  if (pattern == SLP_OPRND_PATTERN_ABAB || pattern == SLP_OPRND_PATTERN_ABBA)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	/* Given oprnd[0] -> A1, B1, A1, B1, A2, B2, A2, B2, ...
+	   Given oprnd[1] -> C1, D1, C1, D1, C2, D2, C2, D2, ...
+	   Create a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...  */
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+2] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+1];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+1];
+      }
+  else if (pattern == SLP_OPRND_PATTERN_AABB)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+	/* Given oprnd[0] -> A1, A1, B1, B1, A2, A2, B2, B2, ...
+	   Given oprnd[1] -> C1, C1, D1, D1, C2, C2, D2, D2, ...
+	   Create a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ...  */
+
+	/* The ordering here is at least to some extent arbitrary.
+	   A generilized version needs to use some explicit ordering.  */
+	oprnds_info[0]->def_stmts[j+1] = oprnds_info[1]->def_stmts[j];
+	oprnds_info[0]->ops[j+1] = oprnds_info[1]->ops[j];
+	oprnds_info[0]->def_stmts[j+2] = oprnds_info[0]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+2] = oprnds_info[0]->ops[j+2];
+	oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+2];
+	oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+2];
+      }
+
+  if (dump_enabled_p ())
+    {
+      dump_printf (MSG_NOTE, "Recurse with:\n");
+      for (unsigned int j = 0; j < group_size; j++)
+	{
+	  dump_printf (MSG_NOTE, "  ");
+	  print_gimple_stmt (dump_file, oprnds_info[0]->def_stmts[j]->stmt, 0);
+	}
+    }
+
+  /* Since we've merged the two nodes in one, make the second one a copy of the first.
+     An improvement could be to truncate to one just node, but that needs refactoring
+     in vect_build_slp_tree_2 which we can avoid for now. The nodes are going to be
+     cached anyway.  */
+  for (unsigned int j = 0; j < group_size; j++)
+    {
+      oprnds_info[1]->def_stmts[j] = oprnds_info[0]->def_stmts[j];
+      oprnds_info[1]->ops[j] = oprnds_info[0]->ops[j];
+    }
+
+  return pattern;
+}
+
 /* Recursively build an SLP tree starting from NODE.
    Fail (and return a value not equal to zero) if def-stmts are not
    isomorphic, require data permutation or are of unsupported types of
@@ -2380,6 +2516,8 @@ out:
 
   stmt_info = stmts[0];
 
+  int rearrange_pattern = try_rearrange_oprnd_info (oprnds_info, group_size);
+
   /* Create SLP_TREE nodes for the definition node/s.  */
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
@@ -2640,6 +2778,98 @@ fail:
   *tree_size += this_tree_size + 1;
   *max_nunits = this_max_nunits;
 
+  /* If we applied any rearrangmenets then we need to reconstruct the original
+     elements with an additional permutation layer.  */
+  if (rearrange_pattern != SLP_OPRND_PATTERN_NONE)
+    {
+      slp_tree one =  new _slp_tree;
+      slp_tree two = new _slp_tree;
+      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
+      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
+      SLP_TREE_VECTYPE (one) = vectype;
+      SLP_TREE_VECTYPE (two) = vectype;
+      SLP_TREE_CHILDREN (one).safe_splice (children);
+      SLP_TREE_CHILDREN (two).safe_splice (children);
+
+      SLP_TREE_CODE (one) = VEC_PERM_EXPR;
+      SLP_TREE_CODE (two) = VEC_PERM_EXPR;
+      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+      SLP_TREE_REPRESENTATIVE (two) = stmts[2];
+
+      if (rearrange_pattern == SLP_OPRND_PATTERN_ABAB)
+	{
+	   /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...
+	      Create node "one" -> A1, B1, A1, B1, A2, B2, A2, B2, ...
+	      Create node "two" -> C1, D1, C1, D1, C2, D2, C2, D2, ...  */
+
+	  for (unsigned int j = 0; j < group_size; j += 4)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	    }
+	}
+      else if (rearrange_pattern == SLP_OPRND_PATTERN_AABB)
+	{
+	   /* Given a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ...
+	      Create node "one" -> A1, A1, B1, B1, A2, A2, B2, B2, ...
+	      Create node "two" -> C1, C1, D1, D1, C2, C2, D2, D2, ...  */
+
+	  for (unsigned int j = 0; j < group_size; j += 4)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 2));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 2));
+
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	    }
+	}
+      else if (rearrange_pattern == SLP_OPRND_PATTERN_ABBA)
+	{
+	   /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...
+	      Create node "one" -> A1, B1, B1, A1, A2, B2, B2, A2, ...
+	      Create node "two" -> C1, D1, D1, C1, C2, D2, D2, C2, ...  */
+
+	  for (unsigned int j = 0; j < group_size; j += 4)
+	    {
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1));
+	      SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0));
+
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3));
+	      SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2));
+	    }
+	}
+
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
+       SLP_TREE_REF_COUNT (child)++;
+
+      node = vect_create_new_slp_node (node, stmts, 2);
+      SLP_TREE_VECTYPE (node) = vectype;
+      SLP_TREE_CHILDREN (node).quick_push (one);
+      SLP_TREE_CHILDREN (node).quick_push (two);
+
+      SLP_TREE_LANES (one) = stmts.length ();
+      SLP_TREE_LANES (two) = stmts.length ();
+
+      children.truncate(0);
+      children.safe_splice(SLP_TREE_CHILDREN (node));
+    }
+
   if (two_operators)
     {
       /* ???  We'd likely want to either cache in bst_map sth like

             reply	other threads:[~2024-02-27 13:37 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-27 13:37 Philipp Tomsich [this message]
  -- strict thread matches above, loose matches on Subject: below --
2024-01-23 20:57 Philipp Tomsich
2024-01-17 19:13 Philipp Tomsich
2023-11-28 13:35 Philipp Tomsich

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240227133713.241BD3858427@sourceware.org \
    --to=ptomsich@gcc.gnu.org \
    --cc=gcc-cvs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).