From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mx2.suse.de (mx2.suse.de [195.135.220.15]) by sourceware.org (Postfix) with ESMTPS id 24FC33857C61 for ; Mon, 16 Nov 2020 13:33:47 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org 24FC33857C61 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=suse.de Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=rguenther@suse.de X-Virus-Scanned: by amavisd-new at test-mx.suse.de Received: from relay2.suse.de (unknown [195.135.221.27]) by mx2.suse.de (Postfix) with ESMTP id 08E18AC23; Mon, 16 Nov 2020 13:33:46 +0000 (UTC) Date: Mon, 16 Nov 2020 14:33:45 +0100 (CET) From: Richard Biener Sender: rguenther@c653.arch.suse.de To: Tamar Christina cc: gcc-patches@gcc.gnu.org, nd@arm.com, ook@ucw.cz Subject: Re: [PATCH 1/2] middle-end : Initial scaffolding and definitions for SLP patttern matches In-Reply-To: Message-ID: References: User-Agent: Alpine 2.21 (LSU 202 2017-01-01) MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII X-Spam-Status: No, score=-6.6 required=5.0 tests=BAYES_00, GIT_PATCH_0, INDUSTRIAL_BODY, INDUSTRIAL_SUBJECT, KAM_DMARC_STATUS, KAM_LOTSOFHASH, RCVD_IN_MSPIKE_H3, RCVD_IN_MSPIKE_WL, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org X-BeenThere: gcc-patches@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-patches mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 16 Nov 2020 13:33:52 -0000 On Mon, 16 Nov 2020, Richard Biener wrote: > On Sat, 14 Nov 2020, Tamar Christina wrote: > > > Hi All, > > > > This patch adds the pre-requisites and general scaffolding for supporting doing > > SLP pattern matching. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > Comments below. > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * tree-vect-loop.c (vect_dissolve_slp_only_patterns): New. > > (vect_dissolve_slp_only_groups): Call here. > > * tree-vect-slp.c (vect_free_slp_tree, vect_create_new_slp_node): Export > > from file. > > (vect_build_slp_tree_2): Set vectype for externals. > > (vect_print_slp_tree): Print SLP only patterns. > > (optimize_load_redistribution_1, optimize_load_redistribution, > > vect_match_slp_patterns_2, vect_match_slp_patterns): New. > > (vect_analyze_slp): Call matcher. > > * tree-vectorizer.c (vec_info::add_pattern_stmt): Save relevancy. > > * tree-vectorizer.h (STMT_VINFO_SAVED_RELEVANT, vect_pop_relevancy, > > vect_dissolve_pattern_relevancy, vect_save_relevancy, > > vect_push_relevancy, vect_free_slp_tree, enum _complex_operation, > > class vect_pattern): New. > > > > --- inline copy of patch -- > > > > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c > > index 39b7319e8253c351a4f6fbdd8c154330f08f2b1b..791d9c6cb0649862a84fd3c80efc89fefedbb085 100644 > > --- a/gcc/tree-vect-loop.c > > +++ b/gcc/tree-vect-loop.c > > @@ -1979,6 +1979,61 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, > > return opt_result::success (); > > } > > > > +/* For every SLP only pattern created by the pattern matched rooted in ROOT > > + restore the relevancy of the original statements over those of the pattern > > + and destroy the pattern relationship. This restores the SLP tree to a state > > + where it can be used when SLP build is cancelled or re-tried. */ > > + > > +static void > > +vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo, > > + hash_set *visited, slp_tree root) > > +{ > > + if (!root || visited->add (root)) > > + return; > > + > > + unsigned int i; > > + slp_tree node; > > + stmt_vec_info related_stmt_info; > > + stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (root); > > + > > + if (stmt_info && STMT_VINFO_SLP_VECT_ONLY (stmt_info)) > > + { > > + vect_pop_relevancy (stmt_info); > > + if ((related_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info)) != NULL) > > + { > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "dissolving relevancy of %G", > > + STMT_VINFO_STMT (stmt_info)); > > + vect_dissolve_pattern_relevancy (related_stmt_info); > > + } > > + } > > + > > + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node) > > + vect_dissolve_slp_only_patterns (loop_vinfo, visited, node); > > +} > > + > > +/* Lookup any SLP Only Pattern statements created by the SLP pattern matcher in > > + all slp_instances in LOOP_VINFO and undo the relevancy of statements such > > + that the original SLP tree before the pattern matching is used. */ > > + > > +static void > > +vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo) > > +{ > > + > > + unsigned int i; > > + hash_set visited; > > + > > + DUMP_VECT_SCOPE ("vect_dissolve_slp_only_patterns"); > > + > > + /* Unmark any SLP only patterns as relevant and restore the STMT_INFO of the > > + related instruction. */ > > + slp_instance instance; > > + FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) > > + vect_dissolve_slp_only_patterns (loop_vinfo, &visited, > > + SLP_INSTANCE_TREE (instance)); > > +} > > + > > /* Look for SLP-only access groups and turn each individual access into its own > > group. */ > > static void > > @@ -2583,6 +2638,9 @@ again: > > /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */ > > gcc_assert (!ok); > > > > + /* Dissolve any SLP patterns created by the SLP pattern matcher. */ > > + vect_dissolve_slp_only_patterns (loop_vinfo); > > + > > /* Try again with SLP forced off but if we didn't do any SLP there is > > no point in re-trying. */ > > if (!slp) > > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c > > index 0c065e835ad13ad32d222e2590e05ef56849c411..3be565a2e566e09a9e42d6c77ba402b9499b06b6 100644 > > --- a/gcc/tree-vect-slp.c > > +++ b/gcc/tree-vect-slp.c > > @@ -105,7 +105,7 @@ _slp_tree::~_slp_tree () > > > > /* Recursively free the memory allocated for the SLP tree rooted at NODE. */ > > > > -static void > > +void > > vect_free_slp_tree (slp_tree node) > > { > > int i; > > @@ -148,7 +148,7 @@ vect_free_slp_instance (slp_instance instance) > > > > /* Create an SLP node for SCALAR_STMTS. */ > > > > -slp_tree > > +static slp_tree > > vect_create_new_slp_node (slp_tree node, > > vec scalar_stmts, unsigned nops) > > { > > @@ -165,7 +165,7 @@ vect_create_new_slp_node (slp_tree node, > > > > /* Create an SLP node for SCALAR_STMTS. */ > > > > -static slp_tree > > +slp_tree > > vect_create_new_slp_node (vec scalar_stmts, unsigned nops) > > { > > return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops); > > @@ -1646,6 +1646,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > { > > slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops); > > SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt; > > + SLP_TREE_VECTYPE (invnode) = vectype; > > This is wrong - the vector type of invariants is determined by > the consuming SLP stmts in vectorizable_* > > > oprnd_info->ops = vNULL; > > children.safe_push (invnode); > > continue; > > @@ -1929,6 +1930,13 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, > > dump_printf (dump_kind, " %u", j); > > dump_printf (dump_kind, " }\n"); > > } > > + if (SLP_TREE_REPRESENTATIVE (node) > > + && STMT_VINFO_SLP_VECT_ONLY (SLP_TREE_REPRESENTATIVE (node))) > > + { > > + dump_printf_loc (metadata, user_loc, "\tSLP Only pattern:\n"); > > + dump_printf_loc (dump_kind, user_loc, "\t %G", > > + STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (node))); > > + } > > if (SLP_TREE_LANE_PERMUTATION (node).exists ()) > > { > > dump_printf_loc (metadata, user_loc, "\tlane permutation {"); > > @@ -2174,6 +2182,219 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size) > > return exact_div (common_multiple (nunits, group_size), group_size); > > } > > > > +/* Helper function of optimize_load_redistribution that performs the operation > > + recursively. */ > > + > > +bool optimize_load_redistribution_1 (hash_set *loads, > > + scalar_stmts_to_slp_tree_map_t *bst_map, > > + hash_set *visited, > > + slp_tree parent, unsigned idx, > > + slp_tree root) > > +{ > > + if (visited->contains (root)) > > + return true; > > + visited->add (root); > > if (visited->add (root)) > return true; > > > + > > + slp_tree node; > > + unsigned i; > > + stmt_vec_info dr_stmt = NULL; > > + > > + /* For now, we don't know anything about externals so do not do anything. */ > > + if (SLP_TREE_DEF_TYPE (root) == vect_external_def > > + || SLP_TREE_DEF_TYPE (root) == vect_constant_def) > > + return false; > > + > > + if (gimple_assign_load_p (STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (root)))) > > The appropriate check is STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE > (root)) && DR_IS_READ (STMT_VINFO_DATA_REF (...)) > > let's not mix gimple & SLP checks when not necessary. > > > + loads->add (root); > > + > > + if (SLP_TREE_CODE (root) == VEC_PERM_EXPR > > else if > > > + && SLP_TREE_LANE_PERMUTATION (root).exists () > > + && !SLP_TREE_SCALAR_STMTS (root).exists ()) > > why do !SLP_TREE_SCALAR_STMTS matter? > > > + { > > + > > extra vertical space > > > + /* First convert this node into a load node and add it to the leaves > > + list and flatten the permute from a lane to a load one. If it's > > + unneeded it will be elided later. */ > > + auto_vec stmts; > > + stmts.create (SLP_TREE_LANES (root)); > > + load_permutation_t load_perm; > > + load_perm.create (SLP_TREE_LANES (root)); > > + lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root); > > + for (unsigned j = 0; j < lane_perm.length (); j++) > > + { > > + std::pair perm = lane_perm[j]; > > + /* This isn't strictly needed, but this function is a temporary > > + one for specifically pattern matching, so don't want it to > > + optimize things the remainder of the pipeline will. */ > > + if (perm.first != j) > > + goto next; > > + node = SLP_TREE_CHILDREN (root)[perm.first]; > > + if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) > > + { > > + load_perm.release (); > > + return false; > > you're possibly prematurely exiting the DFS walk on a two-operator > permute. Ah, guess that's what the above check was for? I guess > it's better to pre-check all children of the VEC_PERM node > to be proper, two(?) leafs with load permutation. > > > + } > > + > > + stmt_vec_info rep_stmt = SLP_TREE_REPRESENTATIVE (node); > > + if (!STMT_VINFO_GROUPED_ACCESS (rep_stmt)) > > + goto next; > > I think for a node with a load permutation this never triggers. > > > + > > + if (!dr_stmt) > > + dr_stmt = DR_GROUP_FIRST_ELEMENT (rep_stmt); > > + > > + if (dr_stmt != DR_GROUP_FIRST_ELEMENT (rep_stmt)) > > + goto next; > > So all of the above could be done on the children w/o allocating > and the rest on the loop iterating over the lanes. > > > + stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]); > > + load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]); > > + } > > + > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "converting loads on permute node %p\n", root); > > + > > + slp_tree *value = bst_map->get (stmts); > > + if (value) > > + node = *value; > > + else > > + { > > + /* One last iteration to free the nodes. */ > > + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node) > > + { > > + /* If we are the only reference to the node, remove the vertex. > > + We don't have to modify the graph since vertices lead the > > + graph traversal. */ > > + vect_free_slp_tree (node); > > + } > > You should do this unconditionally (also if the bst_map lookup > succeeded), but after bumping the refcount for the use in this > node. > > > + > > + vec stmts_cpy = stmts.copy (); > > + node = vect_create_new_slp_node (stmts_cpy.copy (), 0); > > + bst_map->put (stmts_cpy, node); > > + } > > + SLP_TREE_CHILDREN (parent)[idx] = node; > > hmm, what is 'parent' - shouldn't this be 'root'? Ah, I see what > you are doing. I think you should do what optimize_slp does, > namely replace 'root' in-place so you do not have to adjust > parents (or even care for the case of multiple parents refering to > 'root'). I see that doesn't easily work when attempting to CSE so > the actual CSE needs to happen below (*) > > > + SLP_TREE_REF_COUNT (node)++; > > + SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root); > > + SLP_TREE_LOAD_PERMUTATION (node) = load_perm; > > + loads->add (node); > > Note with the load_lane check delayed we no longer need > to vect_gather_slp_loads so early so I suggest to simply > remove the existing early call and do it after > pattern matching instead. I'm testing the patch below for this. Richard. >From 51b89070fcf8eacb188439e6c1b777fd9db4b2ae Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 16 Nov 2020 14:26:20 +0100 Subject: [PATCH] Delay SLP instance loads gathering To: gcc-patches@gcc.gnu.org This delays filling SLP_INSTANCE_LOADS. 2020-11-16 Richard Biener * tree-vectorizer.h (vect_gather_slp_loads): Declare. * tree-vect-loop.c (vect_analyze_loop_2): Call vect_gather_slp_loads. * tree-vect-slp.c (vect_build_slp_instance): Do not gather SLP loads here. (vect_gather_slp_loads): Remove wrapper, new function. (vect_slp_analyze_bb_1): Call it. --- gcc/tree-vect-loop.c | 3 +++ gcc/tree-vect-slp.c | 26 ++++++++++++++++++-------- gcc/tree-vectorizer.h | 1 + 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 4d5532f71d0..ecaaf0116d3 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2298,6 +2298,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) /* Optimize the SLP graph with the vectorization factor fixed. */ vect_optimize_slp (loop_vinfo); + + /* Gather the loads reachable from the SLP graph entries. */ + vect_gather_slp_loads (loop_vinfo); } bool saved_can_use_partial_vectors_p diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index b98d5db9f76..d2f2407ac92 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -2071,13 +2071,6 @@ vect_gather_slp_loads (vec &loads, slp_tree node, } } -static void -vect_gather_slp_loads (slp_instance inst, slp_tree node) -{ - hash_set visited; - vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited); -} - /* Find the last store in SLP INSTANCE. */ @@ -2252,7 +2245,6 @@ vect_build_slp_instance (vec_info *vinfo, new_instance->cost_vec = vNULL; new_instance->subgraph_entries = vNULL; - vect_gather_slp_loads (new_instance, node); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "SLP size %u vs. limit %u.\n", @@ -3071,6 +3063,21 @@ vect_optimize_slp (vec_info *vinfo) } } +/* Gather loads reachable from the individual SLP graph entries. */ + +void +vect_gather_slp_loads (vec_info *vinfo) +{ + unsigned i; + slp_instance instance; + FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) + { + hash_set visited; + vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance), + SLP_INSTANCE_TREE (instance), visited); + } +} + /* For each possible SLP instance decide whether to SLP it and calculate overall unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at @@ -4152,6 +4159,9 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, /* Optimize permutations. */ vect_optimize_slp (bb_vinfo); + /* Gather the loads reachable from the SLP graph entries. */ + vect_gather_slp_loads (bb_vinfo); + vect_record_base_alignments (bb_vinfo); /* Analyze and verify the alignment of data references and the diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 3ccd0fd552d..0ee4ef32eb2 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1974,6 +1974,7 @@ extern opt_result vect_analyze_slp (vec_info *, unsigned); extern bool vect_make_slp_decision (loop_vec_info); extern void vect_detect_hybrid_slp (loop_vec_info); extern void vect_optimize_slp (vec_info *); +extern void vect_gather_slp_loads (vec_info *); extern void vect_get_slp_defs (slp_tree, vec *); extern void vect_get_slp_defs (vec_info *, slp_tree, vec > *, unsigned n = -1U); -- 2.26.2