From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1923) id 6FD313858C29; Wed, 17 Jan 2024 19:13:55 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 6FD313858C29 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1705518835; bh=hsWzjWzLA3r9hUG6FTFp5O46uOtdxHWEtgRHBk1CQgI=; h=From:To:Subject:Date:From; b=EdJ0rS4LUj8Y2Ktg4IIsaRronEK++wgeIvnzaaVRtvIaGFLp1qvLgvS1VTBo5RJU1 1Pa2vJAkxzHdPJnzSLpsAnEicGlwkNWEu0Np4yOWlBMQN9SZHna0bS57l+fHNZIb+s 9QFjFCYnaONi8JGg/eDpec3UYz5J77HcofrRS95g= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Philipp Tomsich To: gcc-cvs@gcc.gnu.org Subject: [gcc(refs/vendors/vrull/heads/slp-improvements)] Build SLP tree for nodes with shared statements X-Act-Checkin: gcc X-Git-Author: Manolis Tsamis X-Git-Refname: refs/vendors/vrull/heads/slp-improvements X-Git-Oldrev: bae3b7919e4231ba1e0488ed5104097b27d5ddfd X-Git-Newrev: 88e655d3c454acca54c139df1a52a0eedab78640 Message-Id: <20240117191355.6FD313858C29@sourceware.org> Date: Wed, 17 Jan 2024 19:13:55 +0000 (GMT) List-Id: https://gcc.gnu.org/g:88e655d3c454acca54c139df1a52a0eedab78640 commit 88e655d3c454acca54c139df1a52a0eedab78640 Author: Manolis Tsamis Date: Mon Oct 30 17:11:34 2023 +0100 Build SLP tree for nodes with shared statements When an SLP tree has multiple occurances of the same statement in a vector node (e.g. {A, B, A, B, ...} it can be that this duplication hurts the efficacy of the tree building procedures, compared to what can be achieved if we used a ordered node with no duplicates (i.e., {A, B, ...}) and then did a permute. This is especially true when the two_operators conditions are met, where nodes are duplicated and if they have duplicates there is an 'exponential explosion' of nodes that end up as splat vectors. This is a first iteration that improves these cases by matching some patterns and deduplicates + permutes them appropriately. This currently targets the vectorization of the first loop in x264's pixel_satd functions. See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138 Ref #342 Diff: --- gcc/tree-vect-slp.cc | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index b6cce55ce90..4c56e8d1395 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -40,6 +40,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-vectorizer.h" #include "langhooks.h" #include "gimple-walk.h" +#include "gimple-pretty-print.h" #include "dbgcnt.h" #include "tree-vector-builder.h" #include "vec-perm-indices.h" @@ -1819,6 +1820,109 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype, SLP_TREE_CHILDREN (perm).quick_push (child2); } +static int +try_rearrange_oprnd_info (vec &oprnds_info, unsigned group_size) +{ + unsigned i; + slp_oprnd_info oprnd_info; + + /* A more generalized version is WIP but this is generic enough anyway. */ + if (oprnds_info.length() != 2 || group_size < 4) + return 0; + + FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) + for (unsigned int j = 0; j < group_size; j += 1) + if (!oprnd_info->def_stmts[j]) + return 0; + + int pattern = 0; + FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) + for (unsigned int j = 0; j < group_size; j += 4) + { + int cur_pattern = 0; + /* Check for an ABAB... pattern. */ + if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 2]) + && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 3])) + cur_pattern = 1; + /* Check for an AABB... pattern. */ + else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 1]) + && (oprnd_info->def_stmts[j + 2] == oprnd_info->def_stmts[j + 3])) + cur_pattern = 2; + /* Check for an ABBA... pattern. */ + else if ((oprnd_info->def_stmts[j] == oprnd_info->def_stmts[j + 3]) + && (oprnd_info->def_stmts[j + 1] == oprnd_info->def_stmts[j + 2])) + cur_pattern = 3; + /* Unrecognised pattern. */ + else + return 0; + + if (pattern == 0) + pattern = cur_pattern; + /* Multiple patterns detected. */ + else if (cur_pattern != pattern) + return 0; + } + + gcc_checking_assert (pattern); + + if (dump_enabled_p ()) + { + if (pattern == 1) + dump_printf (MSG_NOTE, "ABAB"); + else if (pattern == 2) + dump_printf (MSG_NOTE, "AABB"); + else if (pattern == 3) + dump_printf (MSG_NOTE, "ABBA"); + dump_printf (MSG_NOTE, " pattern detected.\n"); + } + + if (pattern == 1) + for (unsigned int j = 0; j < group_size; j += 4) + { + oprnds_info[0]->def_stmts[j+2] = oprnds_info[1]->def_stmts[j]; + oprnds_info[0]->ops[j+2] = oprnds_info[1]->ops[j]; + oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+1]; + oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+1]; + } + else if (pattern == 2) + for (unsigned int j = 0; j < group_size; j += 4) + { + /* The ordering here is at least to some extent arbitrary. + A generilized version needs to use some explicit ordering. */ + oprnds_info[0]->def_stmts[j+1] = oprnds_info[1]->def_stmts[j]; + oprnds_info[0]->ops[j+1] = oprnds_info[1]->ops[j]; + oprnds_info[0]->def_stmts[j+2] = oprnds_info[0]->def_stmts[j+2]; + oprnds_info[0]->ops[j+2] = oprnds_info[0]->ops[j+2]; + oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+2]; + oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+2]; + } + else if (pattern == 3) + /* No need to handle that for now. */ + return 0; + + if (dump_enabled_p ()) + { + dump_printf (MSG_NOTE, "Recurse with:\n"); + for (unsigned int j = 0; j < group_size; j++) + { + dump_printf (MSG_NOTE, " "); + print_gimple_stmt (dump_file, oprnds_info[0]->def_stmts[j]->stmt, 0); + } + } + + /* Since we've merged the two nodes in one, make the second one a copy of the first. + An improvement could be to truncate to one just node, but that needs refactoring + in vect_build_slp_tree_2 which we can avoid for now. The nodes are going to be + cached anyway. */ + for (unsigned int j = 0; j < group_size; j++) + { + oprnds_info[1]->def_stmts[j] = oprnds_info[0]->def_stmts[j]; + oprnds_info[1]->ops[j] = oprnds_info[0]->ops[j]; + } + + return pattern; +} + /* Recursively build an SLP tree starting from NODE. Fail (and return a value not equal to zero) if def-stmts are not isomorphic, require data permutation or are of unsupported types of @@ -2380,6 +2484,8 @@ out: stmt_info = stmts[0]; + int rearrange_pattern = try_rearrange_oprnd_info (oprnds_info, group_size); + /* Create SLP_TREE nodes for the definition node/s. */ FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) { @@ -2640,6 +2746,83 @@ fail: *tree_size += this_tree_size + 1; *max_nunits = this_max_nunits; + /* If we applied any rearrangmenets then we need to reconstruct the original + elements with an additional permutation layer. */ + if (rearrange_pattern) + { + slp_tree one = new _slp_tree; + slp_tree two = new _slp_tree; + SLP_TREE_DEF_TYPE (one) = vect_internal_def; + SLP_TREE_DEF_TYPE (two) = vect_internal_def; + SLP_TREE_VECTYPE (one) = vectype; + SLP_TREE_VECTYPE (two) = vectype; + SLP_TREE_CHILDREN (one).safe_splice (children); + SLP_TREE_CHILDREN (two).safe_splice (children); + + if (rearrange_pattern == 1) + { + SLP_TREE_CODE (one) = VEC_PERM_EXPR; + SLP_TREE_CODE (two) = VEC_PERM_EXPR; + unsigned int h = group_size / 2; + SLP_TREE_REPRESENTATIVE (one) = stmts[0]; + SLP_TREE_REPRESENTATIVE (two) = stmts[h]; + + for (unsigned int j = 0; j < h; j += 2) + { + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1)); + } + for (unsigned int j = 0; j < h; j += 2) + { + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1)); + } + } + else if (rearrange_pattern == 2) + { + SLP_TREE_CODE (one) = VEC_PERM_EXPR; + SLP_TREE_CODE (two) = VEC_PERM_EXPR; + unsigned int h = group_size / 2; + SLP_TREE_REPRESENTATIVE (one) = stmts[0]; + SLP_TREE_REPRESENTATIVE (two) = stmts[h]; + + for (unsigned int j = 0; j < h; j += 2) + { + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1)); + SLP_TREE_LANE_PERMUTATION(one).safe_push (std::make_pair (0, j + 1)); + } + for (unsigned int j = 0; j < h; j += 2) + { + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1)); + SLP_TREE_LANE_PERMUTATION(two).safe_push (std::make_pair (0, h + j + 1)); + } + } + + slp_tree child; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child) + SLP_TREE_REF_COUNT (child)++; + + node = vect_create_new_slp_node (node, stmts, 2); + SLP_TREE_VECTYPE (node) = vectype; + SLP_TREE_CHILDREN (node).quick_push (one); + SLP_TREE_CHILDREN (node).quick_push (two); + + SLP_TREE_LANES (one) = stmts.length (); + SLP_TREE_LANES (two) = stmts.length (); + + children.truncate(0); + children.safe_splice(SLP_TREE_CHILDREN (node)); + } + + if (two_operators) { /* ??? We'd likely want to either cache in bst_map sth like