Date: Tue, 09 Jun 2015 14:22:00 -0000
From: Richard Biener
To: gcc-patches@gcc.gnu.org
Subject: [PATCH] Remove last permutation restriction for SLP vectorizing

This removes the last permutation restriction, the one in place for
reductions.  To avoid regressing gfortran.dg/vect/fast-math-pr37021.f90,
the patch also implements SLP permutations for strided group loads.
With those two pieces we can finally SLP vectorize the complex
multiplication (which in the gfortran.dg/vect/fast-math-pr37021.f90
testcase uses a variable stride).

The code generated on x86_64 still suffers from PR56766: combine is
not able to use sse3_addsubv2df3 because the RTL doesn't use the
expected vec_merge but (vec_select (vec_concat ...)) instead.
Previously we vectorized this with unrolling and interleaving.
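For illustration, a C analogue of the pattern this enables (the
actual testcase is Fortran; the function below and its exact indexing
are invented for this sketch): complex values stored as interleaved
re/im pairs, with one operand accessed through a runtime stride.
Each iteration performs a group of two loads per operand whose values
feed both lanes of the result, i.e. an SLP tree that needs a load
permutation on a strided group load.

/* Hypothetical analogue of gfortran.dg/vect/fast-math-pr37021.f90,
   not the actual testcase: complex multiplication where operand A
   is accessed with a variable stride.  */
void
cmul_strided (double *restrict r, const double *a, const double *b,
	      int n, int stride)
{
  for (int i = 0; i < n; i++)
    {
      const double *ai = a + 2 * i * stride;	/* strided group load */
      double re = ai[0] * b[2*i]   - ai[1] * b[2*i+1];
      double im = ai[0] * b[2*i+1] + ai[1] * b[2*i];
      r[2*i]   = re;
      r[2*i+1] = im;
    }
}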
Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Richard.

2015-06-09  Richard Biener

	* tree-vect-slp.c (vect_attempt_slp_rearrange_stmts): Split out
	from ...
	(vect_supported_load_permutation_p): ... here.  Handle
	supportable permutations in reductions.
	* tree-vect-stmts.c (vectorizable_load): Handle SLP permutations
	for vectorizing strided group loads.

Index: gcc/tree-vect-slp.c
===================================================================
*** gcc/tree-vect-slp.c	(revision 224271)
--- gcc/tree-vect-slp.c	(working copy)
*************** vect_slp_rearrange_stmts (slp_tree node,
*** 1326,1331 ****
--- 1270,1336 ----
  }
  
+ /* Attempt to reorder stmts in a reduction chain so that we don't
+    require any load permutation.  Return true if that was possible,
+    otherwise return false.  */
+ 
+ static bool
+ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
+ {
+   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
+   unsigned int i, j;
+   sbitmap load_index;
+   unsigned int lidx;
+   slp_tree node, load;
+ 
+   /* Compare all the permutation sequences to the first one.  We know
+      that at least one load is permuted.  */
+   node = SLP_INSTANCE_LOADS (slp_instn)[0];
+   if (!node->load_permutation.exists ())
+     return false;
+   for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
+     {
+       if (!load->load_permutation.exists ())
+         return false;
+       FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
+         if (lidx != node->load_permutation[j])
+           return false;
+     }
+ 
+   /* Check that the loads in the first sequence are different and there
+      are no gaps between them.  */
+   load_index = sbitmap_alloc (group_size);
+   bitmap_clear (load_index);
+   FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
+     {
+       if (bitmap_bit_p (load_index, lidx))
+         {
+           sbitmap_free (load_index);
+           return false;
+         }
+       bitmap_set_bit (load_index, lidx);
+     }
+   for (i = 0; i < group_size; i++)
+     if (!bitmap_bit_p (load_index, i))
+       {
+         sbitmap_free (load_index);
+         return false;
+       }
+   sbitmap_free (load_index);
+ 
+   /* This permutation is valid for reduction.  Since the order of the
+      statements in the nodes is not important unless they are memory
+      accesses, we can rearrange the statements in all the nodes
+      according to the order of the loads.  */
+   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+                             node->load_permutation);
+ 
+   /* We are done, no actual permutations need to be generated.  */
+   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+     SLP_TREE_LOAD_PERMUTATION (node).release ();
+   return true;
+ }
+ 
  /* Check if the required load permutations in the SLP instance
     SLP_INSTN are supported.  */
*************** vect_supported_load_permutation_p (slp_i
*** 1334,1340 ****
  {
    unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
    unsigned int i, j, k, next;
-   sbitmap load_index;
    slp_tree node;
    gimple stmt, load, next_load, first_load;
    struct data_reference *dr;
--- 1339,1344 ----
*************** vect_supported_load_permutation_p (slp_i
*** 1369,1427 ****
    stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  
    /* Reduction (there are no data-refs in the root).
!      In reduction chain the order of the loads is important.  */
    if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
        && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      {
!       slp_tree load;
!       unsigned int lidx;
! 
!       /* Compare all the permutation sequences to the first one.  We know
!          that at least one load is permuted.  */
!       node = SLP_INSTANCE_LOADS (slp_instn)[0];
!       if (!node->load_permutation.exists ())
!         return false;
!       for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
!         {
!           if (!load->load_permutation.exists ())
!             return false;
!           FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
!             if (lidx != node->load_permutation[j])
!               return false;
!         }
!       /* Check that the loads in the first sequence are different and there
!          are no gaps between them.  */
!       load_index = sbitmap_alloc (group_size);
!       bitmap_clear (load_index);
!       FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
!         {
!           if (bitmap_bit_p (load_index, lidx))
!             {
!               sbitmap_free (load_index);
!               return false;
!             }
!           bitmap_set_bit (load_index, lidx);
!         }
!       for (i = 0; i < group_size; i++)
!         if (!bitmap_bit_p (load_index, i))
!           {
!             sbitmap_free (load_index);
!             return false;
!           }
!       sbitmap_free (load_index);
! 
!       /* This permutation is valid for reduction.  Since the order of the
!          statements in the nodes is not important unless they are memory
!          accesses, we can rearrange the statements in all the nodes
!          according to the order of the loads.  */
!       vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
!                                 node->load_permutation);
! 
!       /* We are done, no actual permutations need to be generated.  */
!       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
!         SLP_TREE_LOAD_PERMUTATION (node).release ();
!       return true;
      }
  
    /* In basic block vectorization we allow any subchain of an interleaving
--- 1373,1386 ----
    stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  
    /* Reduction (there are no data-refs in the root).
!      In reduction chain the order of the loads is not important.  */
    if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
        && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      {
!       if (vect_attempt_slp_rearrange_stmts (slp_instn))
!         return true;
!       /* Fallthru to general load permutation handling.  */
      }
  
    /* In basic block vectorization we allow any subchain of an interleaving
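To make the condition the new function tests concrete, here is a
standalone C sketch of it, detached from the GCC data structures
(the function name and types below are invented for the sketch, and
a plain array stands in for the sbitmap): rearranging is only valid
when all load nodes agree on one permutation and that permutation
covers each of the group_size slots exactly once.

#include <stdbool.h>

/* Sketch of the validity test in vect_attempt_slp_rearrange_stmts:
   PERM of length GROUP_SIZE must hit every slot 0..GROUP_SIZE-1
   exactly once, i.e. contain no duplicate and leave no gap.  */
static bool
perm_covers_group (const unsigned *perm, unsigned group_size)
{
  bool seen[256] = { false };	/* arbitrary limit for the sketch */
  if (group_size > 256)
    return false;
  for (unsigned i = 0; i < group_size; i++)
    {
      if (perm[i] >= group_size || seen[perm[i]])
	return false;		/* out-of-range or duplicate index */
      seen[perm[i]] = true;
    }
  /* GROUP_SIZE pairwise-distinct in-range indices necessarily cover
     every slot, which is what the explicit gap loop in the patch
     verifies.  */
  return true;
}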
Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c	(revision 224271)
--- gcc/tree-vect-stmts.c	(working copy)
*************** vectorizable_load (gimple stmt, gimple_s
*** 5995,6003 ****
    if ((grouped_load
         && (slp || PURE_SLP_STMT (stmt_info)))
        && (group_size > nunits
!           || nunits % group_size != 0
!           /* We don't support load permutations.  */
!           || slp_perm))
      {
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unhandled strided group load\n");
--- 5995,6001 ----
    if ((grouped_load
         && (slp || PURE_SLP_STMT (stmt_info)))
        && (group_size > nunits
!           || nunits % group_size != 0))
      {
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unhandled strided group load\n");
*************** vectorizable_load (gimple stmt, gimple_s
*** 6294,6299 ****
--- 6292,6298 ----
        alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
        int nloads = nunits;
        tree ltype = TREE_TYPE (vectype);
+       auto_vec<tree> dr_chain;
        if (slp)
          {
            nloads = nunits / group_size;
*************** vectorizable_load (gimple stmt, gimple_s
*** 6303,6309 ****
            ltype = vectype;
            ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
            ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
!           gcc_assert (!slp_perm);
          }
        for (j = 0; j < ncopies; j++)
          {
--- 6302,6309 ----
            ltype = vectype;
            ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
            ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
!           if (slp_perm)
!             dr_chain.create (ncopies);
          }
        for (j = 0; j < ncopies; j++)
          {
*************** vectorizable_load (gimple stmt, gimple_s
*** 6350,6362 ****
            }
          if (slp)
!           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
      return true;
    }
--- 6350,6369 ----
            }
          if (slp)
!           {
!             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
!             if (slp_perm)
!               dr_chain.quick_push (gimple_assign_lhs (new_stmt));
!           }
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
+     if (slp_perm)
+       vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
+                                     slp_node_instance, false);
      return true;
    }
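For the tree-vect-stmts.c side, what the vect_transform_slp_perm_load
call does with the defs collected in dr_chain can be modeled in
scalar form (again just an illustration, with invented names): the
values arrive in the order the strided group loads produced them, and
the load permutation selects which one feeds each SLP lane.

/* Scalar model of applying an SLP load permutation:
   OUT[i] = IN[PERM[i]].  E.g. perm = {1, 0} swaps two lanes, the
   kind of permutation the imaginary part of a complex multiply
   needs.  */
static void
apply_load_perm (double *out, const double *in,
		 const unsigned *perm, unsigned n)
{
  for (unsigned i = 0; i < n; i++)
    out[i] = in[perm[i]];
}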