From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1666) id 433473854834; Mon, 21 Jun 2021 13:02:05 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 433473854834 MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Richard Biener To: gcc-cvs@gcc.gnu.org Subject: [gcc r12-1699] tree-optimization/101120 - fix compile-time issue with SLP groups X-Act-Checkin: gcc X-Git-Author: Richard Biener X-Git-Refname: refs/heads/master X-Git-Oldrev: 21761d2b2b01f6cef4287c646845f6b3006546aa X-Git-Newrev: 0ad9c7087ef3904da89f2db6007b6d28b116087f Message-Id: <20210621130205.433473854834@sourceware.org> Date: Mon, 21 Jun 2021 13:02:05 +0000 (GMT) X-BeenThere: gcc-cvs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 21 Jun 2021 13:02:05 -0000 https://gcc.gnu.org/g:0ad9c7087ef3904da89f2db6007b6d28b116087f commit r12-1699-g0ad9c7087ef3904da89f2db6007b6d28b116087f Author: Richard Biener Date: Fri Jun 18 14:07:00 2021 +0200 tree-optimization/101120 - fix compile-time issue with SLP groups This places two hacks to avoid an old compile-time issue when vectorizing large permuted SLP groups with gaps where we end up emitting loads and IV adjustments for the gap as well and those have quite a high cost until they are eventually cleaned up. The first hack is to fold the auto-inc style IV updates early in the vectorizer rather than in the next forwprop pass which shortens the SSA use-def chains of the used IV. The second hack is to remove the unused loads after we've picked all that we possibly use. 2021-06-18 Richard Biener PR tree-optimization/101120 * tree-vect-data-refs.c (bump_vector_ptr): Fold the built increment. * tree-vect-slp.c (vect_transform_slp_perm_load): Add DR chain DCE capability. * tree-vectorizer.h (vect_transform_slp_perm_load): Adjust. 
* tree-vect-stmts.c (vectorizable_load): Remove unused loads in the DR chain for SLP. Diff: --- gcc/tree-vect-data-refs.c | 12 +++++++++++- gcc/tree-vect-slp.c | 31 ++++++++++++++++++++++++++----- gcc/tree-vect-stmts.c | 7 ++++++- gcc/tree-vectorizer.h | 2 +- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index bb086c6ac1c..be067c8923b 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -53,6 +53,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-hash-traits.h" #include "vec-perm-indices.h" #include "internal-fn.h" +#include "gimple-fold.h" /* Return true if load- or store-lanes optab OPTAB is implemented for COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ @@ -5026,7 +5027,7 @@ bump_vector_ptr (vec_info *vinfo, struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree update = TYPE_SIZE_UNIT (vectype); - gassign *incr_stmt; + gimple *incr_stmt; ssa_op_iter iter; use_operand_p use_p; tree new_dataref_ptr; @@ -5041,6 +5042,15 @@ bump_vector_ptr (vec_info *vinfo, incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, dataref_ptr, update); vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi); + /* Fold the increment, avoiding excessive chains use-def chains of + those, leading to compile-time issues for passes until the next + forwprop pass which would do this as well. */ + gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt); + if (fold_stmt (&fold_gsi, follow_all_ssa_edges)) + { + incr_stmt = gsi_stmt (fold_gsi); + update_stmt (incr_stmt); + } /* Copy the points-to information if it exists. 
*/ if (DR_PTR_INFO (dr)) diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 99e7ce21e4e..a32f86b8bc7 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -6284,14 +6284,15 @@ vect_get_slp_defs (vec_info *, If ANALYZE_ONLY is TRUE, only check that it is possible to create valid permute statements for the SLP node NODE. Store the number of vector permute instructions in *N_PERMS and the number of vector load - instructions in *N_LOADS. */ + instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions + that were not needed. */ bool vect_transform_slp_perm_load (vec_info *vinfo, slp_tree node, vec<tree> dr_chain, gimple_stmt_iterator *gsi, poly_uint64 vf, bool analyze_only, unsigned *n_perms, - unsigned int *n_loads) + unsigned int *n_loads, bool dce_chain) { stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; int vec_index = 0; @@ -6370,6 +6371,7 @@ vect_transform_slp_perm_load (vec_info *vinfo, } auto_sbitmap used_in_lanes (in_nlanes); bitmap_clear (used_in_lanes); + auto_bitmap used_defs; unsigned int count = mask.encoded_nelts (); mask.quick_grow (count); @@ -6477,11 +6479,20 @@ vect_transform_slp_perm_load (vec_info *vinfo, mask_vec); vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); + if (dce_chain) + { + bitmap_set_bit (used_defs, first_vec_index + ri); + bitmap_set_bit (used_defs, second_vec_index + ri); + } } else - /* If mask was NULL_TREE generate the requested - identity transform. */ - perm_stmt = SSA_NAME_DEF_STMT (first_vec); + { + /* If mask was NULL_TREE generate the requested + identity transform. */ + perm_stmt = SSA_NAME_DEF_STMT (first_vec); + if (dce_chain) + bitmap_set_bit (used_defs, first_vec_index + ri); + } /* Store the vector statement in NODE. 
*/ SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt; @@ -6521,6 +6532,16 @@ vect_transform_slp_perm_load (vec_info *vinfo, } } + if (dce_chain) + for (unsigned i = 0; i < dr_chain.length (); ++i) + if (!bitmap_bit_p (used_defs, i)) + { + gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]); + gimple_stmt_iterator rgsi = gsi_for_stmt (stmt); + gsi_remove (&rgsi, true); + release_defs (stmt); + } + return true; } diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index eeef96a2eb6..4ee11b2041a 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -9762,8 +9762,13 @@ vectorizable_load (vec_info *vinfo, if (slp_perm) { unsigned n_perms; + /* For SLP we know we've seen all possible uses of dr_chain so + direct vect_transform_slp_perm_load to DCE the unused parts. + ??? This is a hack to prevent compile-time issues as seen + in PR101120 and friends. */ bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, - gsi, vf, false, &n_perms); + gsi, vf, false, &n_perms, + nullptr, true); gcc_assert (ok); } else diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 04c20f8bd0f..5c71fbc487f 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2012,7 +2012,7 @@ extern void vect_free_slp_instance (slp_instance); extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, vec<tree>, gimple_stmt_iterator *, poly_uint64, bool, unsigned *, - unsigned * = nullptr); + unsigned * = nullptr, bool = false); extern bool vect_slp_analyze_operations (vec_info *); extern void vect_schedule_slp (vec_info *, vec<slp_instance>); extern opt_result vect_analyze_slp (vec_info *, unsigned);