From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1923) id 241BD3858427; Tue, 27 Feb 2024 13:37:13 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 241BD3858427 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1709041033; bh=w+kHsbUhtwuri0m1SnH2D38J7aTM1hJOrgHcU/DFNhM=; h=From:To:Subject:Date:From; b=yAxiNXjwLfkMq/JRjXOdooj7soZv9w5CHYLwBW2GR8W9Z7f3DGYzDcjj3X5YKAGk/ 86yxdMyhtWY+Y88PWQ/7yQkruBYiq7u7IBiPvIWD0DBlKwppOyqH9g1zsoc83kmVRD BBnqE7GzpUJLJin03qG6aWVg2jLxEudd02PEVUiI= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Philipp Tomsich To: gcc-cvs@gcc.gnu.org Subject: [gcc(refs/vendors/vrull/heads/slp-improvements)] Build SLP tree for nodes with shared statements X-Act-Checkin: gcc X-Git-Author: Manolis Tsamis X-Git-Refname: refs/vendors/vrull/heads/slp-improvements X-Git-Oldrev: a82ecdeacf83996321de5312522b0cc961e02a95 X-Git-Newrev: 821917bf62b82ebcd0f423c21b06def7735e1371 Message-Id: <20240227133713.241BD3858427@sourceware.org> Date: Tue, 27 Feb 2024 13:37:13 +0000 (GMT) List-Id: https://gcc.gnu.org/g:821917bf62b82ebcd0f423c21b06def7735e1371 commit 821917bf62b82ebcd0f423c21b06def7735e1371 Author: Manolis Tsamis Date: Mon Oct 30 17:11:34 2023 +0100 Build SLP tree for nodes with shared statements When an SLP tree has multiple occurances of the same statement in a vector node (e.g. {A, B, A, B, ...} it can be that this duplication hurts the efficacy of the tree building procedures, compared to what can be achieved if we used a ordered node with no duplicates (i.e., {A, B, ...}) and then did a permute. This is especially true when the two_operators conditions are met, where nodes are duplicated and if they have duplicates there is an 'exponential explosion' of nodes that end up as splat vectors. We check that the vector elements taking part in a rearranged pattern (e.g. ABAB) are actually different. Otherwise a splat vector (AAAA) would be considered to fulfill any rearrangement pattern. We don't want to merge splat vectors together in a mixed node as that can both reduce performance and cause codegen issues. This commit improves these cases by matching the patterns and deduplicates + permutes them appropriately. This change (among other cases) targets the vectorization of the first loop in x264's pixel_satd functions. See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138 Ref #342 Diff: --- gcc/testsuite/gcc.target/aarch64/pr98138_1.c | 32 ++++ gcc/testsuite/gcc.target/aarch64/pr98138_2.c | 48 ++++++ gcc/tree-vect-slp.cc | 230 +++++++++++++++++++++++++++ 3 files changed, 310 insertions(+) diff --git a/gcc/testsuite/gcc.target/aarch64/pr98138_1.c b/gcc/testsuite/gcc.target/aarch64/pr98138_1.c new file mode 100644 index 00000000000..a9f7d4d6a99 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr98138_1.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */ + +extern void test(unsigned int t[4][4]); + +void foo(unsigned char *p1, int i1, unsigned char *p2, int i2) +{ + unsigned int tmp[4][4]; + unsigned int a0, a1, a2, a3; + + for (int i = 0; i < 4; i++, p1 += i1, p2 += i2) + { + a0 = (p1[0] - p2[0]) + ((p1[4] - p2[4]) << 16); + a1 = (p1[1] - p2[1]) + ((p1[5] - p2[5]) << 16); + a2 = (p1[2] - p2[2]) + ((p1[6] - p2[6]) << 16); + a3 = (p1[3] - p2[3]) + ((p1[7] - p2[7]) << 16); + + int t0 = a0 + a1; + int t1 = a0 - a1; + int t2 = a2 + a3; + int t3 = a2 - a3; + + tmp[i][0] = t0 + t2; + tmp[i][2] = t0 - t2; + tmp[i][1] = t1 + t3; + tmp[i][3] = t1 - t3; + } + + test(tmp); +} + +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/pr98138_2.c b/gcc/testsuite/gcc.target/aarch64/pr98138_2.c new file mode 100644 index 00000000000..7e85470a677 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr98138_2.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-all" } */ + +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned short uint16_t; + +static inline +uint32_t abs2 ( uint32_t a ) +{ + uint32_t s = ((a>>15)&0x10001)*0xffff; + return (a+s)^s; +} + +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\ + int t0 = s0 + s1;\ + int t1 = s0 - s1;\ + int t2 = s2 + s3;\ + int t3 = s2 - s3;\ + d0 = t0 + t2;\ + d2 = t0 - t2;\ + d1 = t1 + t3;\ + d3 = t1 - t3;\ +} + +int +foo ( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +{ + uint32_t tmp[4][4]; + uint32_t a0, a1, a2, a3; + int sum = 0; + for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 ) + { + a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); + a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); + a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); + a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); + HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 ); + } + for( int i = 0; i < 4; i++ ) + { + HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] ); + sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); + } + return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1; +} + +/* { dg-final { scan-tree-dump "vectorized 2 loops" "vect" } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 895f4f7fb6b..238a17ca4e1 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -40,6 +40,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-vectorizer.h" #include "langhooks.h" #include "gimple-walk.h" +#include "gimple-pretty-print.h" #include "dbgcnt.h" #include "tree-vector-builder.h" #include "vec-perm-indices.h" @@ -1819,6 +1820,141 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype, SLP_TREE_CHILDREN (perm).quick_push (child2); } +enum slp_oprnd_pattern +{ + SLP_OPRND_PATTERN_NONE, + SLP_OPRND_PATTERN_ABAB, + SLP_OPRND_PATTERN_AABB, + SLP_OPRND_PATTERN_ABBA +}; + +static int +try_rearrange_oprnd_info (vec &oprnds_info, unsigned group_size) +{ + unsigned i; + slp_oprnd_info oprnd_info; + + /* A more generalized version is WIP but this is generic enough anyway. */ + if (oprnds_info.length() != 2 || group_size % 4 != 0) + return SLP_OPRND_PATTERN_NONE; + + if (!oprnds_info[0]->def_stmts[0] + || !is_a (oprnds_info[0]->def_stmts[0]->stmt)) + return SLP_OPRND_PATTERN_NONE; + + enum tree_code code + = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt); + FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) + for (unsigned int j = 0; j < group_size; j += 1) + { + if (!oprnd_info->def_stmts[j] + || !is_a (oprnd_info->def_stmts[j]->stmt) + || STMT_VINFO_DATA_REF (oprnd_info->def_stmts[j])) + return SLP_OPRND_PATTERN_NONE; + /* Don't mix different operations. */ + if (gimple_assign_rhs_code (oprnd_info->def_stmts[j]->stmt) != code) + return SLP_OPRND_PATTERN_NONE; + } + + if (gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt) + != gimple_assign_rhs_code (oprnds_info[1]->def_stmts[0]->stmt)) + return SLP_OPRND_PATTERN_NONE; + + int pattern = SLP_OPRND_PATTERN_NONE; + FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) + for (unsigned int j = 0; j < group_size; j += 4) + { + int cur_pattern = SLP_OPRND_PATTERN_NONE; + /* Check for an ABAB... pattern. */ + if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 2]) + && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 3]) + && (oprnd_info->def_stmts[j] != oprnd_info->def_stmts[j + 1])) + cur_pattern = SLP_OPRND_PATTERN_ABAB; + /* Check for an AABB... pattern. */ + else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 1]) + && (oprnd_info->def_stmts[j + 2] == oprnd_info->def_stmts[j + 3]) + && (oprnd_info->def_stmts[j] != oprnd_info->def_stmts[j + 2])) + cur_pattern = SLP_OPRND_PATTERN_AABB; + /* Check for an ABBA... pattern. */ + else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 3]) + && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 2]) + && (oprnd_info->def_stmts[j] != oprnd_info->def_stmts[j + 1])) + cur_pattern = SLP_OPRND_PATTERN_ABBA; + /* Unrecognised pattern. */ + else + return SLP_OPRND_PATTERN_NONE; + + if (pattern == SLP_OPRND_PATTERN_NONE) + pattern = cur_pattern; + /* Multiple patterns detected. */ + else if (cur_pattern != pattern) + return SLP_OPRND_PATTERN_NONE; + } + + gcc_checking_assert (pattern != SLP_OPRND_PATTERN_NONE); + + if (dump_enabled_p ()) + { + if (pattern == SLP_OPRND_PATTERN_ABAB) + dump_printf (MSG_NOTE, "ABAB"); + else if (pattern == SLP_OPRND_PATTERN_AABB) + dump_printf (MSG_NOTE, "AABB"); + else if (pattern == SLP_OPRND_PATTERN_ABBA) + dump_printf (MSG_NOTE, "ABBA"); + dump_printf (MSG_NOTE, " pattern detected.\n"); + } + + if (pattern == SLP_OPRND_PATTERN_ABAB || pattern == SLP_OPRND_PATTERN_ABBA) + for (unsigned int j = 0; j < group_size; j += 4) + { + /* Given oprnd[0] -> A1, B1, A1, B1, A2, B2, A2, B2, ... + Given oprnd[1] -> C1, D1, C1, D1, C2, D2, C2, D2, ... + Create a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ... */ + oprnds_info[0]->def_stmts[j+2] = oprnds_info[1]->def_stmts[j]; + oprnds_info[0]->ops[j+2] = oprnds_info[1]->ops[j]; + oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+1]; + oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+1]; + } + else if (pattern == SLP_OPRND_PATTERN_AABB) + for (unsigned int j = 0; j < group_size; j += 4) + { + /* Given oprnd[0] -> A1, A1, B1, B1, A2, A2, B2, B2, ... + Given oprnd[1] -> C1, C1, D1, D1, C2, C2, D2, D2, ... + Create a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ... */ + + /* The ordering here is at least to some extent arbitrary. + A generilized version needs to use some explicit ordering. */ + oprnds_info[0]->def_stmts[j+1] = oprnds_info[1]->def_stmts[j]; + oprnds_info[0]->ops[j+1] = oprnds_info[1]->ops[j]; + oprnds_info[0]->def_stmts[j+2] = oprnds_info[0]->def_stmts[j+2]; + oprnds_info[0]->ops[j+2] = oprnds_info[0]->ops[j+2]; + oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+2]; + oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+2]; + } + + if (dump_enabled_p ()) + { + dump_printf (MSG_NOTE, "Recurse with:\n"); + for (unsigned int j = 0; j < group_size; j++) + { + dump_printf (MSG_NOTE, " "); + print_gimple_stmt (dump_file, oprnds_info[0]->def_stmts[j]->stmt, 0); + } + } + + /* Since we've merged the two nodes in one, make the second one a copy of the first. + An improvement could be to truncate to one just node, but that needs refactoring + in vect_build_slp_tree_2 which we can avoid for now. The nodes are going to be + cached anyway. */ + for (unsigned int j = 0; j < group_size; j++) + { + oprnds_info[1]->def_stmts[j] = oprnds_info[0]->def_stmts[j]; + oprnds_info[1]->ops[j] = oprnds_info[0]->ops[j]; + } + + return pattern; +} + /* Recursively build an SLP tree starting from NODE. Fail (and return a value not equal to zero) if def-stmts are not isomorphic, require data permutation or are of unsupported types of @@ -2380,6 +2516,8 @@ out: stmt_info = stmts[0]; + int rearrange_pattern = try_rearrange_oprnd_info (oprnds_info, group_size); + /* Create SLP_TREE nodes for the definition node/s. */ FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) { @@ -2640,6 +2778,98 @@ fail: *tree_size += this_tree_size + 1; *max_nunits = this_max_nunits; + /* If we applied any rearrangmenets then we need to reconstruct the original + elements with an additional permutation layer. */ + if (rearrange_pattern != SLP_OPRND_PATTERN_NONE) + { + slp_tree one = new _slp_tree; + slp_tree two = new _slp_tree; + SLP_TREE_DEF_TYPE (one) = vect_internal_def; + SLP_TREE_DEF_TYPE (two) = vect_internal_def; + SLP_TREE_VECTYPE (one) = vectype; + SLP_TREE_VECTYPE (two) = vectype; + SLP_TREE_CHILDREN (one).safe_splice (children); + SLP_TREE_CHILDREN (two).safe_splice (children); + + SLP_TREE_CODE (one) = VEC_PERM_EXPR; + SLP_TREE_CODE (two) = VEC_PERM_EXPR; + SLP_TREE_REPRESENTATIVE (one) = stmts[0]; + SLP_TREE_REPRESENTATIVE (two) = stmts[2]; + + if (rearrange_pattern == SLP_OPRND_PATTERN_ABAB) + { + /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ... + Create node "one" -> A1, B1, A1, B1, A2, B2, A2, B2, ... + Create node "two" -> C1, D1, C1, D1, C2, D2, C2, D2, ... */ + + for (unsigned int j = 0; j < group_size; j += 4) + { + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1)); + + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3)); + } + } + else if (rearrange_pattern == SLP_OPRND_PATTERN_AABB) + { + /* Given a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ... + Create node "one" -> A1, A1, B1, B1, A2, A2, B2, B2, ... + Create node "two" -> C1, C1, D1, D1, C2, C2, D2, D2, ... */ + + for (unsigned int j = 0; j < group_size; j += 4) + { + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 2)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 2)); + + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 1)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 1)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3)); + } + } + else if (rearrange_pattern == SLP_OPRND_PATTERN_ABBA) + { + /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ... + Create node "one" -> A1, B1, B1, A1, A2, B2, B2, A2, ... + Create node "two" -> C1, D1, D1, C1, C2, D2, D2, C2, ... */ + + for (unsigned int j = 0; j < group_size; j += 4) + { + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 0)); + + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 3)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, j + 2)); + } + } + + slp_tree child; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child) + SLP_TREE_REF_COUNT (child)++; + + node = vect_create_new_slp_node (node, stmts, 2); + SLP_TREE_VECTYPE (node) = vectype; + SLP_TREE_CHILDREN (node).quick_push (one); + SLP_TREE_CHILDREN (node).quick_push (two); + + SLP_TREE_LANES (one) = stmts.length (); + SLP_TREE_LANES (two) = stmts.length (); + + children.truncate(0); + children.safe_splice(SLP_TREE_CHILDREN (node)); + } + if (two_operators) { /* ??? We'd likely want to either cache in bst_map sth like