Hi All,

This introduces a post-processing step for the pattern matcher to flatten
permutes introduced by the complex-multiplication patterns.

This performs a blend early such that SLP is not cancelled by the
LOAD_LANES permute.  This is a temporary workaround to the fact that loads
are not CSEd during building and is required to produce efficient code.

Bootstrapped and regtested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-slp.c (optimize_load_redistribution_1): New.
	(optimize_load_redistribution): New.
	(vect_match_slp_patterns): Use it.

--- inline copy of patch --
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325cf8406466ac25c 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
   return exact_div (common_multiple (nunits, group_size), group_size);
 }
 
+/* Helper function of optimize_load_redistribution that performs the operation
+   recursively.  */
+
+static slp_tree
+optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
+				hash_set<slp_tree> *visited, slp_tree root)
+{
+  if (visited->add (root))
+    return NULL;
+
+  slp_tree node;
+  unsigned i;
+
+  /* For now, we don't know anything about externals so do not do anything.  */
+  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
+      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)
+    return NULL;
+  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
+	   && SLP_TREE_LANE_PERMUTATION (root).exists ()
+	   && !SLP_TREE_SCALAR_STMTS (root).exists ())
+    {
+      /* First convert this node into a load node and add it to the leaves
+	 list and flatten the permute from a lane to a load one.  If it's
+	 unneeded it will be elided later.  */
+      auto_vec<stmt_vec_info> stmts;
+      stmts.create (SLP_TREE_LANES (root));
+      load_permutation_t load_perm;
+      load_perm.create (SLP_TREE_LANES (root));
+      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
+      for (unsigned j = 0; j < lane_perm.length (); j++)
+	{
+	  std::pair<unsigned, unsigned> perm = lane_perm[j];
+	  /* This isn't strictly needed, but this function is a temporary
+	     one for specifically pattern matching, so don't want it to
+	     optimize things the remainder of the pipeline will.  */
+	  if (perm.first != j)
+	    goto next;
+	  node = SLP_TREE_CHILDREN (root)[perm.first];
+
+	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
+	    return NULL;
+
+	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+	  load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]);
+	}
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "converting stmts on permute node %p\n", root);
+
+      slp_tree *value = bst_map->get (stmts);
+      if (value)
+	node = *value;
+      else
+	{
+	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+	    SLP_TREE_REF_COUNT (node)++;
+
+	  vec<stmt_vec_info> stmts_cpy = stmts.copy ();
+	  node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
+	  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
+	  SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
+	  bst_map->put (stmts_cpy, node);
+	}
+      SLP_TREE_REF_COUNT (node)++;
+
+      return node;
+    }
+
+next:
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value = optimize_load_redistribution_1 (bst_map, visited, node);
+      if (value)
+	{
+	  SLP_TREE_CHILDREN (root)[i] = value;
+	  vect_free_slp_tree (node);
+	}
+    }
+
+  return NULL;
+}
+
+/* Temporary workaround for loads not being CSEd during SLP build.  This
+   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
+   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
+   same DR such that the final operation is equal to a permuted load.  Such
+   NODES are then directly converted into LOADS themselves.  The nodes are
+   CSEd using BST_MAP.  */
+
+static void
+optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
+			      slp_tree root)
+{
+  slp_tree node;
+  unsigned i;
+  hash_set<slp_tree> visited;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited, node);
+      if (value)
+	{
+	  SLP_TREE_CHILDREN (root)[i] = value;
+	  vect_free_slp_tree (node);
+	}
+    }
+}
+
 /* Helper function of vect_match_slp_patterns.
 
    Attempts to match patterns against the slp tree rooted in REF_NODE using
@@ -2276,7 +2385,7 @@ static bool
 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 			 hash_set<slp_tree> *visited,
 			 slp_tree_to_load_perm_map_t *perm_cache,
-			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
+			 scalar_stmts_to_slp_tree_map_t *bst_map)
 {
   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
@@ -2291,6 +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 
   if (found_p)
     {
+      optimize_load_redistribution (bst_map, *ref_node);
+
       if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
--