From: Richard Biener
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r11-5821] tree-optimization/98113 - vectorize a sequence of BIT_INSERT_EXPRs
Date: Mon, 7 Dec 2020 11:54:32 +0000 (GMT)
Message-Id: <20201207115432.A2210385783D@sourceware.org>
X-Act-Checkin: gcc
X-Git-Author: Richard Biener
X-Git-Refname: refs/heads/master
X-Git-Oldrev: cdcbef3c3310a14f2994982b44cb1f8e14c77232
X-Git-Newrev: ebdfd1606da6b5aa586b0cd156b69b659235c9c2

https://gcc.gnu.org/g:ebdfd1606da6b5aa586b0cd156b69b659235c9c2

commit r11-5821-gebdfd1606da6b5aa586b0cd156b69b659235c9c2
Author: Richard Biener
Date:   Thu Dec 3 10:25:14 2020 +0100

    tree-optimization/98113 - vectorize a sequence of BIT_INSERT_EXPRs
    
    This adds the capability to vectorize a sequence of vector
    BIT_INSERT_EXPRs similarly to how we vectorize vector constructors.
    
    2020-12-03  Richard Biener
    
    	PR tree-optimization/98113
    	* tree-vectorizer.h (struct slp_root): New.
    	(_bb_vec_info::roots): New member.
    	* tree-vect-slp.c (vect_analyze_slp): Also walk BB info roots.
    	(_bb_vec_info::_bb_vec_info): Adjust.
    	(_bb_vec_info::~_bb_vec_info): Likewise.
    	(vld_cmp): New.
    	(vect_slp_is_lane_insert): Likewise.
    	(vect_slp_check_for_constructors): Match a series of
    	BIT_INSERT_EXPRs as vector constructor.
    	(vect_slp_analyze_bb_1): Continue if BB info roots is not empty;
    	mark the whole BIT_INSERT_EXPR root sequence as pure_slp.
    	* gcc.dg/vect/bb-slp-70.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/vect/bb-slp-70.c |  17 +++
 gcc/tree-vect-slp.c                   | 192 ++++++++++++++++++++++++++++++----
 gcc/tree-vectorizer.h                 |  12 +++
 3 files changed, 200 insertions(+), 21 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-70.c b/gcc/testsuite/gcc.dg/vect/bb-slp-70.c
new file mode 100644
index 00000000000..0eb70112bde
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-70.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-mavx512vl -mavx512vpopcntdq" { target avx512vpopcntdq } } */
+
+typedef unsigned uv4si __attribute__((vector_size(16)));
+
+uv4si __attribute__((noinline))
+vpopctf (uv4si a)
+{
+  uv4si r;
+  r[2] = __builtin_popcount (a[2]);
+  r[1] = __builtin_popcount (a[1]);
+  r[0] = __builtin_popcount (a[0]);
+  r[3] = __builtin_popcount (a[3]);
+  return r;
+}
+
+/* { dg-final { scan-tree-dump "optimized: basic block" "slp2" { target avx512vpopcntdq } } } */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 3bd40cbb193..2dccca02aa0 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2578,6 +2578,19 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 					       ? slp_inst_kind_store
 					       : slp_inst_kind_ctor,
 					       max_tree_size);
+  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
+    {
+      for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
+	{
+	  vect_location = bb_vinfo->roots[i].root->stmt;
+	  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
+				       bb_vinfo->roots[i].stmts,
+				       bb_vinfo->roots[i].root,
+				       max_tree_size, bst_map, NULL))
+	    bb_vinfo->roots[i].stmts = vNULL;
+	}
+    }
+
   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
     {
       /* Find SLP sequences starting from reduction chains.  */
@@ -3336,7 +3349,7 @@ vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
 /* Initialize a bb_vec_info struct for the statements in BBS basic
    blocks.  */
 
 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
-  : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
+  : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
 {
   for (unsigned i = 0; i < bbs.length (); ++i)
     {
@@ -3383,6 +3396,10 @@ _bb_vec_info::~_bb_vec_info ()
 	  gimple_set_uid (stmt, -1);
 	}
     }
+
+  for (unsigned i = 0; i < roots.length (); ++i)
+    roots[i].stmts.release ();
+  roots.release ();
 }
 
 /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
@@ -4105,6 +4122,38 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
   return true;
 }
 
+/* qsort comparator for lane defs.  */
+
+static int
+vld_cmp (const void *a_, const void *b_)
+{
+  auto *a = (const std::pair<unsigned, tree> *)a_;
+  auto *b = (const std::pair<unsigned, tree> *)b_;
+  return a->first - b->first;
+}
+
+/* Return true if USE_STMT is a vector lane insert into VEC and set
+   *THIS_LANE to the lane number that is set.  */
+
+static bool
+vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
+{
+  gassign *use_ass = dyn_cast <gassign *> (use_stmt);
+  if (!use_ass
+      || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
+      || (vec
+	  ? gimple_assign_rhs1 (use_ass) != vec
+	  : ((vec = gimple_assign_rhs1 (use_ass)), false))
+      || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
+				     TREE_TYPE (gimple_assign_rhs2 (use_ass)))
+      || !constant_multiple_p
+	    (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
+	     tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
+	     this_lane))
+    return false;
+  return true;
+}
+
 /* Find any vectorizable constructors and add them to the grouped_store
    array.  */
@@ -4116,28 +4165,114 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
 	 !gsi_end_p (gsi); gsi_next (&gsi))
       {
 	gassign *assign = dyn_cast <gassign *> (gsi_stmt (gsi));
-	if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
+	if (!assign)
 	  continue;
 
 	tree rhs = gimple_assign_rhs1 (assign);
-	if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
-	    || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
-			 CONSTRUCTOR_NELTS (rhs))
-	    || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
-	    || uniform_vector_p (rhs))
-	  continue;
+	if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
+	  {
+	    if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
+		|| maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
+			     CONSTRUCTOR_NELTS (rhs))
+		|| VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
+		|| uniform_vector_p (rhs))
+	      continue;
 
-	unsigned j;
-	tree val;
-	FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
-	  if (TREE_CODE (val) != SSA_NAME
-	      || !bb_vinfo->lookup_def (val))
-	    break;
-	if (j != CONSTRUCTOR_NELTS (rhs))
-	  continue;
+	    unsigned j;
+	    tree val;
+	    FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
+	      if (TREE_CODE (val) != SSA_NAME
+		  || !bb_vinfo->lookup_def (val))
+		break;
+	    if (j != CONSTRUCTOR_NELTS (rhs))
+	      continue;
 
-	stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
-	BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
+	    stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
+	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
+	  }
+	else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
+		 && VECTOR_TYPE_P (TREE_TYPE (rhs))
+		 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
+		 && integer_zerop (gimple_assign_rhs3 (assign))
+		 && useless_type_conversion_p
+		      (TREE_TYPE (TREE_TYPE (rhs)),
+		       TREE_TYPE (gimple_assign_rhs2 (assign))))
+	  {
+	    /* We start to match on insert to lane zero but since the
+	       inserts need not be ordered we'd have to search both
+	       the def and the use chains.  */
+	    tree vectype = TREE_TYPE (rhs);
+	    unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+	    auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
+	    auto_sbitmap lanes (nlanes);
+	    bitmap_clear (lanes);
+	    bitmap_set_bit (lanes, 0);
+	    tree def = gimple_assign_lhs (assign);
+	    lane_defs.quick_push
+			 (std::make_pair (0, gimple_assign_rhs2 (assign)));
+	    unsigned lanes_found = 1;
+	    /* Start with the use chains, the last stmt will be the root.  */
+	    stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
+	    do
+	      {
+		use_operand_p use_p;
+		gimple *use_stmt;
+		if (!single_imm_use (def, &use_p, &use_stmt))
+		  break;
+		unsigned this_lane;
+		if (!bb_vinfo->lookup_stmt (use_stmt)
+		    || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
+		    || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
+		  break;
+		if (bitmap_bit_p (lanes, this_lane))
+		  break;
+		lanes_found++;
+		bitmap_set_bit (lanes, this_lane);
+		gassign *use_ass = as_a <gassign *> (use_stmt);
+		lane_defs.quick_push (std::make_pair
+				       (this_lane, gimple_assign_rhs2 (use_ass)));
+		last = bb_vinfo->lookup_stmt (use_ass);
+		def = gimple_assign_lhs (use_ass);
+	      }
+	    while (lanes_found < nlanes);
+	    if (lanes_found < nlanes)
+	      {
+		/* Now search the def chain.  */
+		def = gimple_assign_rhs1 (assign);
+		do
+		  {
+		    if (!has_single_use (def))
+		      break;
+		    gimple *def_stmt = SSA_NAME_DEF_STMT (def);
+		    unsigned this_lane;
+		    if (!bb_vinfo->lookup_stmt (def_stmt)
+			|| !vect_slp_is_lane_insert (def_stmt,
+						     NULL_TREE, &this_lane)
+			|| !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
+		      break;
+		    if (bitmap_bit_p (lanes, this_lane))
+		      break;
+		    lanes_found++;
+		    bitmap_set_bit (lanes, this_lane);
+		    lane_defs.quick_push (std::make_pair
+					   (this_lane,
+					    gimple_assign_rhs2 (def_stmt)));
+		    def = gimple_assign_rhs1 (def_stmt);
+		  }
+		while (lanes_found < nlanes);
+	      }
+	    if (lanes_found == nlanes)
+	      {
+		/* Sort lane_defs after the lane index and register the root.  */
+		lane_defs.qsort (vld_cmp);
+		vec<stmt_vec_info> stmts;
+		stmts.create (nlanes);
+		for (unsigned i = 0; i < nlanes; ++i)
+		  stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
+		bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
+						     stmts, last));
+	      }
+	  }
       }
 }
 
@@ -4227,7 +4362,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
   /* If there are no grouped stores and no constructors in the region
      there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
-  if (bb_vinfo->grouped_stores.is_empty ())
+  if (bb_vinfo->grouped_stores.is_empty ()
+      && bb_vinfo->roots.is_empty ())
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -4290,8 +4426,22 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
 	 relevant.  */
       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
-      if (SLP_INSTANCE_ROOT_STMT (instance))
-	STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
+      if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
+	{
+	  STMT_SLP_TYPE (root) = pure_slp;
+	  if (is_gimple_assign (root->stmt)
+	      && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
+	    {
+	      /* ??? We should probably record the whole vector of
+		 root stmts so we do not have to back-track here...  */
+	      for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
+		   n != 1; --n)
+		{
+		  root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
+		  STMT_SLP_TYPE (root) = pure_slp;
+		}
+	    }
+	}
 
       i++;
     }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index c0f786c8f34..95e8ea06a62 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -849,6 +849,16 @@ loop_vec_info_for_loop (class loop *loop)
   return (loop_vec_info) loop->aux;
 }
 
+struct slp_root
+{
+  slp_root (slp_instance_kind kind_, vec<stmt_vec_info> stmts_,
+	    stmt_vec_info root_)
+    : kind(kind_), stmts(stmts_), root(root_) {}
+  slp_instance_kind kind;
+  vec<stmt_vec_info> stmts;
+  stmt_vec_info root;
+};
+
 typedef class _bb_vec_info : public vec_info
 {
 public:
@@ -860,6 +870,8 @@ public:
      entry edge to cover bbs[0] PHI nodes and have a region entry
      insert location.  */
   vec<basic_block> bbs;
+
+  vec<slp_root> roots;
 } *bb_vec_info;
 
 #define BB_VINFO_BB(B) (B)->bb
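
For illustration (an editor's sketch, not part of the commit or the
testsuite; the function names below are made up): with GNU C vector
extensions, writing the elements of a vector-typed variable one at a
time is gimplified into exactly the chain of BIT_INSERT_EXPRs that
vect_slp_check_for_constructors now matches, while brace
initialization produces a vector CONSTRUCTOR, which BB SLP handled
already.

typedef unsigned uv4si __attribute__((vector_size(16)));

/* Gimplifies to a chain roughly like
     r_1 = BIT_INSERT_EXPR <r_0(D), a_2, 32>;
     r_3 = BIT_INSERT_EXPR <r_1, b_4, 0>;
     ...
   (bit positions are lane * 32 here).  The matcher records the chain
   as an slp_inst_kind_ctor root even though the lanes are written
   out of order.  */
uv4si
lane_inserts (unsigned a, unsigned b, unsigned c, unsigned d)
{
  uv4si r;
  r[1] = a;
  r[0] = b;
  r[3] = c;
  r[2] = d;
  return r;
}

/* Equivalent CONSTRUCTOR form, r = {b, a, d, c}, which BB SLP
   recognized before this change.  */
uv4si
ctor_form (unsigned a, unsigned b, unsigned c, unsigned d)
{
  return (uv4si) { b, a, d, c };
}

Because the inserts need not appear in lane order, the matcher keys
on the insert into lane zero and then walks both the use chain and
the def chain from that statement until all nlanes lanes are covered,
which is why lane_inserts above still matches.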