* [PATCH] Remove last permutation restriction for SLP vectorizing
@ 2015-06-09 14:22 Richard Biener
0 siblings, 0 replies; only message in thread
From: Richard Biener @ 2015-06-09 14:22 UTC (permalink / raw)
To: gcc-patches
This removes the last load-permutation restriction that was in place for
reductions. To not regress gfortran.dg/vect/fast-math-pr37021.f90 the
patch also implements SLP permutations for strided group loads. With
those two pieces
we can finally SLP vectorize the complex multiplication (which
in the gfortran.dg/vect/fast-math-pr37021.f90 testcase uses
variable stride). Code generated on x86_64 suffers from PR56766,
combine not being able to use sse3_addsubv2df3 because the
RTL doesn't use the expected vec_merge but
(vec_select (vec_concat )) instead.
Previously we vectorized this with unrolling and interleaving.
Bootstrap and regtest running on x86_64-unknown-linux-gnu.
Richard.
2015-06-09 Richard Biener <rguenther@suse.de>
* tree-vect-slp.c (vect_attempt_slp_rearrange_stmts): Split
out from ...
(vect_supported_load_permutation_p): ... here. Handle
supportable permutations in reductions.
* tree-vect-stmts.c (vectorizable_load): Handle SLP permutations
for vectorizing strided group loads.
Index: gcc/tree-vect-slp.c
===================================================================
*** gcc/tree-vect-slp.c (revision 224271)
--- gcc/tree-vect-slp.c (working copy)
*************** vect_slp_rearrange_stmts (slp_tree node,
*** 1326,1331 ****
--- 1270,1336 ----
}
+ /* Attempt to reorder stmts in a reduction chain so that we don't
+ require any load permutation. Return true if that was possible,
+ otherwise return false. */
+
+ static bool
+ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
+ {
+ unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
+ unsigned int i, j;
+ sbitmap load_index;
+ unsigned int lidx;
+ slp_tree node, load;
+
+ /* Compare all the permutation sequences to the first one. We know
+ that at least one load is permuted. */
+ node = SLP_INSTANCE_LOADS (slp_instn)[0];
+ if (!node->load_permutation.exists ())
+ return false;
+ for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
+ {
+ if (!load->load_permutation.exists ())
+ return false;
+ FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
+ if (lidx != node->load_permutation[j])
+ return false;
+ }
+
+ /* Check that the loads in the first sequence are different and there
+ are no gaps between them. */
+ load_index = sbitmap_alloc (group_size);
+ bitmap_clear (load_index);
+ FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
+ {
+ if (bitmap_bit_p (load_index, lidx))
+ {
+ sbitmap_free (load_index);
+ return false;
+ }
+ bitmap_set_bit (load_index, lidx);
+ }
+ for (i = 0; i < group_size; i++)
+ if (!bitmap_bit_p (load_index, i))
+ {
+ sbitmap_free (load_index);
+ return false;
+ }
+ sbitmap_free (load_index);
+
+ /* This permutation is valid for reduction. Since the order of the
+ statements in the nodes is not important unless they are memory
+ accesses, we can rearrange the statements in all the nodes
+ according to the order of the loads. */
+ vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+ node->load_permutation);
+
+ /* We are done, no actual permutations need to be generated. */
+ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+ SLP_TREE_LOAD_PERMUTATION (node).release ();
+ return true;
+ }
+
/* Check if the required load permutations in the SLP instance
SLP_INSTN are supported. */
*************** vect_supported_load_permutation_p (slp_i
*** 1334,1340 ****
{
unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
unsigned int i, j, k, next;
- sbitmap load_index;
slp_tree node;
gimple stmt, load, next_load, first_load;
struct data_reference *dr;
--- 1339,1344 ----
*************** vect_supported_load_permutation_p (slp_i
*** 1369,1427 ****
stmt = SLP_TREE_SCALAR_STMTS (node)[0];
/* Reduction (there are no data-refs in the root).
! In reduction chain the order of the loads is important. */
if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
&& !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
{
! slp_tree load;
! unsigned int lidx;
!
! /* Compare all the permutation sequences to the first one. We know
! that at least one load is permuted. */
! node = SLP_INSTANCE_LOADS (slp_instn)[0];
! if (!node->load_permutation.exists ())
! return false;
! for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
! {
! if (!load->load_permutation.exists ())
! return false;
! FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
! if (lidx != node->load_permutation[j])
! return false;
! }
! /* Check that the loads in the first sequence are different and there
! are no gaps between them. */
! load_index = sbitmap_alloc (group_size);
! bitmap_clear (load_index);
! FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
! {
! if (bitmap_bit_p (load_index, lidx))
! {
! sbitmap_free (load_index);
! return false;
! }
! bitmap_set_bit (load_index, lidx);
! }
! for (i = 0; i < group_size; i++)
! if (!bitmap_bit_p (load_index, i))
! {
! sbitmap_free (load_index);
! return false;
! }
! sbitmap_free (load_index);
!
! /* This permutation is valid for reduction. Since the order of the
! statements in the nodes is not important unless they are memory
! accesses, we can rearrange the statements in all the nodes
! according to the order of the loads. */
! vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
! node->load_permutation);
!
! /* We are done, no actual permutations need to be generated. */
! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
! SLP_TREE_LOAD_PERMUTATION (node).release ();
! return true;
}
/* In basic block vectorization we allow any subchain of an interleaving
--- 1373,1386 ----
stmt = SLP_TREE_SCALAR_STMTS (node)[0];
/* Reduction (there are no data-refs in the root).
! In reduction chain the order of the loads is not important. */
if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
&& !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
{
! if (vect_attempt_slp_rearrange_stmts (slp_instn))
! return true;
! /* Fallthru to general load permutation handling. */
}
/* In basic block vectorization we allow any subchain of an interleaving
Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c (revision 224271)
--- gcc/tree-vect-stmts.c (working copy)
*************** vectorizable_load (gimple stmt, gimple_s
*** 5995,6003 ****
if ((grouped_load
&& (slp || PURE_SLP_STMT (stmt_info)))
&& (group_size > nunits
! || nunits % group_size != 0
! /* We don't support load permutations. */
! || slp_perm))
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"unhandled strided group load\n");
--- 5995,6001 ----
if ((grouped_load
&& (slp || PURE_SLP_STMT (stmt_info)))
&& (group_size > nunits
! || nunits % group_size != 0))
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"unhandled strided group load\n");
*************** vectorizable_load (gimple stmt, gimple_s
*** 6294,6299 ****
--- 6292,6298 ----
alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
int nloads = nunits;
tree ltype = TREE_TYPE (vectype);
+ auto_vec<tree> dr_chain;
if (slp)
{
nloads = nunits / group_size;
*************** vectorizable_load (gimple stmt, gimple_s
*** 6303,6309 ****
ltype = vectype;
ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
! gcc_assert (!slp_perm);
}
for (j = 0; j < ncopies; j++)
{
--- 6302,6309 ----
ltype = vectype;
ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
! if (slp_perm)
! dr_chain.create (ncopies);
}
for (j = 0; j < ncopies; j++)
{
*************** vectorizable_load (gimple stmt, gimple_s
*** 6350,6362 ****
}
if (slp)
! SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
return true;
}
--- 6350,6369 ----
}
if (slp)
! {
! SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
! if (slp_perm)
! dr_chain.quick_push (gimple_assign_lhs (new_stmt));
! }
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
+ if (slp_perm)
+ vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
+ slp_node_instance, false);
return true;
}
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2015-06-09 14:21 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-09 14:22 [PATCH] Remove last permutation restriction for SLP vectorizing Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).