* [PATCH] Remove last permutation restriction for SLP vectorizing
@ 2015-06-09 14:22 Richard Biener
0 siblings, 0 replies; only message in thread
From: Richard Biener @ 2015-06-09 14:22 UTC (permalink / raw)
To: gcc-patches
This removes the last load-permutation restriction that was in place for
reductions. To not regress gfortran.dg/vect/fast-math-pr37021.f90 the
patch also implements SLP permutations for strided group loads. With
those two pieces
we can finally SLP vectorize the complex multiplication (which
in the gfortran.dg/vect/fast-math-pr37021.f90 testcase uses
variable stride). Code generated on x86_64 suffers from PR56766,
combine not being able to use sse3_addsubv2df3 because the
RTL doesn't use the expected vec_merge but
(vec_select (vec_concat )) instead.
Previously we vectorized this with unrolling and interleaving.
Bootstrap and regtest running on x86_64-unknown-linux-gnu.
Richard.
2015-06-09 Richard Biener <rguenther@suse.de>
* tree-vect-slp.c (vect_attempt_slp_rearrange_stmts): Split
out from ...
(vect_supported_load_permutation_p): ... here. Handle
supportable permutations in reductions.
* tree-vect-stmts.c (vectorizable_load): Handle SLP permutations
for vectorizing strided group loads.
Index: gcc/tree-vect-slp.c
===================================================================
*** gcc/tree-vect-slp.c (revision 224271)
--- gcc/tree-vect-slp.c (working copy)
*************** vect_slp_rearrange_stmts (slp_tree node,
*** 1326,1331 ****
--- 1270,1336 ----
}
+ /* Attempt to reorder stmts in a reduction chain so that we don't
+ require any load permutation. Return true if that was possible,
+ otherwise return false. */
+
+ static bool
+ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
+ {
+ unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
+ unsigned int i, j;
+ sbitmap load_index;
+ unsigned int lidx;
+ slp_tree node, load;
+
+ /* Compare all the permutation sequences to the first one. We know
+ that at least one load is permuted. */
+ node = SLP_INSTANCE_LOADS (slp_instn)[0];
+ if (!node->load_permutation.exists ())
+ return false;
+ for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
+ {
+ if (!load->load_permutation.exists ())
+ return false;
+ FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
+ if (lidx != node->load_permutation[j])
+ return false;
+ }
+
+ /* Check that the loads in the first sequence are different and there
+ are no gaps between them. */
+ load_index = sbitmap_alloc (group_size);
+ bitmap_clear (load_index);
+ FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
+ {
+ if (bitmap_bit_p (load_index, lidx))
+ {
+ sbitmap_free (load_index);
+ return false;
+ }
+ bitmap_set_bit (load_index, lidx);
+ }
+ for (i = 0; i < group_size; i++)
+ if (!bitmap_bit_p (load_index, i))
+ {
+ sbitmap_free (load_index);
+ return false;
+ }
+ sbitmap_free (load_index);
+
+ /* This permutation is valid for reduction. Since the order of the
+ statements in the nodes is not important unless they are memory
+ accesses, we can rearrange the statements in all the nodes
+ according to the order of the loads. */
+ vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+ node->load_permutation);
+
+ /* We are done, no actual permutations need to be generated. */
+ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+ SLP_TREE_LOAD_PERMUTATION (node).release ();
+ return true;
+ }
+
/* Check if the required load permutations in the SLP instance
SLP_INSTN are supported. */
*************** vect_supported_load_permutation_p (slp_i
*** 1334,1340 ****
{
unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
unsigned int i, j, k, next;
- sbitmap load_index;
slp_tree node;
gimple stmt, load, next_load, first_load;
struct data_reference *dr;
--- 1339,1344 ----
*************** vect_supported_load_permutation_p (slp_i
*** 1369,1427 ****
stmt = SLP_TREE_SCALAR_STMTS (node)[0];
/* Reduction (there are no data-refs in the root).
! In reduction chain the order of the loads is important. */
if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
&& !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
{
! slp_tree load;
! unsigned int lidx;
!
! /* Compare all the permutation sequences to the first one. We know
! that at least one load is permuted. */
! node = SLP_INSTANCE_LOADS (slp_instn)[0];
! if (!node->load_permutation.exists ())
! return false;
! for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
! {
! if (!load->load_permutation.exists ())
! return false;
! FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
! if (lidx != node->load_permutation[j])
! return false;
! }
! /* Check that the loads in the first sequence are different and there
! are no gaps between them. */
! load_index = sbitmap_alloc (group_size);
! bitmap_clear (load_index);
! FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
! {
! if (bitmap_bit_p (load_index, lidx))
! {
! sbitmap_free (load_index);
! return false;
! }
! bitmap_set_bit (load_index, lidx);
! }
! for (i = 0; i < group_size; i++)
! if (!bitmap_bit_p (load_index, i))
! {
! sbitmap_free (load_index);
! return false;
! }
! sbitmap_free (load_index);
!
! /* This permutation is valid for reduction. Since the order of the
! statements in the nodes is not important unless they are memory
! accesses, we can rearrange the statements in all the nodes
! according to the order of the loads. */
! vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
! node->load_permutation);
!
! /* We are done, no actual permutations need to be generated. */
! FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
! SLP_TREE_LOAD_PERMUTATION (node).release ();
! return true;
}
/* In basic block vectorization we allow any subchain of an interleaving
--- 1373,1386 ----
stmt = SLP_TREE_SCALAR_STMTS (node)[0];
/* Reduction (there are no data-refs in the root).
! In reduction chain the order of the loads is not important. */
if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
&& !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
{
! if (vect_attempt_slp_rearrange_stmts (slp_instn))
! return true;
! /* Fallthru to general load permutation handling. */
}
/* In basic block vectorization we allow any subchain of an interleaving
Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c (revision 224271)
--- gcc/tree-vect-stmts.c (working copy)
*************** vectorizable_load (gimple stmt, gimple_s
*** 5995,6003 ****
if ((grouped_load
&& (slp || PURE_SLP_STMT (stmt_info)))
&& (group_size > nunits
! || nunits % group_size != 0
! /* We don't support load permutations. */
! || slp_perm))
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"unhandled strided group load\n");
--- 5995,6001 ----
if ((grouped_load
&& (slp || PURE_SLP_STMT (stmt_info)))
&& (group_size > nunits
! || nunits % group_size != 0))
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"unhandled strided group load\n");
*************** vectorizable_load (gimple stmt, gimple_s
*** 6294,6299 ****
--- 6292,6298 ----
alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
int nloads = nunits;
tree ltype = TREE_TYPE (vectype);
+ auto_vec<tree> dr_chain;
if (slp)
{
nloads = nunits / group_size;
*************** vectorizable_load (gimple stmt, gimple_s
*** 6303,6309 ****
ltype = vectype;
ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
! gcc_assert (!slp_perm);
}
for (j = 0; j < ncopies; j++)
{
--- 6302,6309 ----
ltype = vectype;
ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
! if (slp_perm)
! dr_chain.create (ncopies);
}
for (j = 0; j < ncopies; j++)
{
*************** vectorizable_load (gimple stmt, gimple_s
*** 6350,6362 ****
}
if (slp)
! SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
return true;
}
--- 6350,6369 ----
}
if (slp)
! {
! SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
! if (slp_perm)
! dr_chain.quick_push (gimple_assign_lhs (new_stmt));
! }
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
+ if (slp_perm)
+ vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
+ slp_node_instance, false);
return true;
}
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2015-06-09 14:21 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-09 14:22 [PATCH] Remove last permutation restriction for SLP vectorizing Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).