Date: Tue, 09 Jun 2015 14:22:00 -0000
From: Richard Biener
To: gcc-patches@gcc.gnu.org
Subject: [PATCH] Remove last permutation restriction for SLP vectorizing

This removes the last permutation restriction, the one in place for
reductions.  To avoid regressing gfortran.dg/vect/fast-math-pr37021.f90,
the patch also implements SLP permutations for strided group loads.
With those two pieces we can finally SLP vectorize the complex
multiplication (which in the gfortran.dg/vect/fast-math-pr37021.f90
testcase uses a variable stride).

The code generated on x86_64 still suffers from PR56766: combine is
not able to use sse3_addsubv2df3 because the RTL doesn't use the
expected vec_merge but (vec_select (vec_concat ...)) instead.
Previously we vectorized this with unrolling and interleaving.
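For illustration, a C analogue of the pattern this enables (the
actual testcase is Fortran; the function below and its exact indexing
are invented for this sketch): complex values stored as interleaved
re/im pairs, with one operand accessed through a runtime stride.
Each iteration performs a group of two loads per operand whose values
feed both lanes of the result, i.e. an SLP tree that needs a load
permutation on a strided group load.

/* Hypothetical analogue of gfortran.dg/vect/fast-math-pr37021.f90,
   not the actual testcase: complex multiplication where operand A
   is accessed with a variable stride.  */
void
cmul_strided (double *restrict r, const double *a, const double *b,
	      int n, int stride)
{
  for (int i = 0; i < n; i++)
    {
      const double *ai = a + 2 * i * stride;	/* strided group load */
      double re = ai[0] * b[2*i]   - ai[1] * b[2*i+1];
      double im = ai[0] * b[2*i+1] + ai[1] * b[2*i];
      r[2*i]   = re;
      r[2*i+1] = im;
    }
}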
Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Richard.

2015-06-09  Richard Biener

	* tree-vect-slp.c (vect_attempt_slp_rearrange_stmts): Split out
	from ...
	(vect_supported_load_permutation_p): ... here.  Handle
	supportable permutations in reductions.
	* tree-vect-stmts.c (vectorizable_load): Handle SLP permutations
	for vectorizing strided group loads.

Index: gcc/tree-vect-slp.c
===================================================================
*** gcc/tree-vect-slp.c	(revision 224271)
--- gcc/tree-vect-slp.c	(working copy)
*************** vect_slp_rearrange_stmts (slp_tree node,
*** 1326,1331 ****
--- 1270,1336 ----
  }
  
+ /* Attempt to reorder stmts in a reduction chain so that we don't
+    require any load permutation.  Return true if that was possible,
+    otherwise return false.  */
+ 
+ static bool
+ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
+ {
+   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
+   unsigned int i, j;
+   sbitmap load_index;
+   unsigned int lidx;
+   slp_tree node, load;
+ 
+   /* Compare all the permutation sequences to the first one.  We know
+      that at least one load is permuted.  */
+   node = SLP_INSTANCE_LOADS (slp_instn)[0];
+   if (!node->load_permutation.exists ())
+     return false;
+   for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
+     {
+       if (!load->load_permutation.exists ())
+         return false;
+       FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
+         if (lidx != node->load_permutation[j])
+           return false;
+     }
+ 
+   /* Check that the loads in the first sequence are different and there
+      are no gaps between them.  */
+   load_index = sbitmap_alloc (group_size);
+   bitmap_clear (load_index);
+   FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
+     {
+       if (bitmap_bit_p (load_index, lidx))
+         {
+           sbitmap_free (load_index);
+           return false;
+         }
+       bitmap_set_bit (load_index, lidx);
+     }
+   for (i = 0; i < group_size; i++)
+     if (!bitmap_bit_p (load_index, i))
+       {
+         sbitmap_free (load_index);
+         return false;
+       }
+   sbitmap_free (load_index);
+ 
+   /* This permutation is valid for reduction.  Since the order of the
+      statements in the nodes is not important unless they are memory
+      accesses, we can rearrange the statements in all the nodes
+      according to the order of the loads.  */
+   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+                             node->load_permutation);
+ 
+   /* We are done, no actual permutations need to be generated.  */
+   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+     SLP_TREE_LOAD_PERMUTATION (node).release ();
+   return true;
+ }
+ 
  /* Check if the required load permutations in the SLP instance
     SLP_INSTN are supported.  */
*************** vect_supported_load_permutation_p (slp_i
*** 1334,1340 ****
  {
    unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
    unsigned int i, j, k, next;
-   sbitmap load_index;
    slp_tree node;
    gimple stmt, load, next_load, first_load;
    struct data_reference *dr;
--- 1339,1344 ----
*************** vect_supported_load_permutation_p (slp_i
*** 1369,1427 ****
    stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  
    /* Reduction (there are no data-refs in the root).
!      In reduction chain the order of the loads is important.  */
    if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
        && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      {
!       slp_tree load;
!       unsigned int lidx;
! 
!       /* Compare all the permutation sequences to the first one.  We know
!          that at least one load is permuted.  */
!       node = SLP_INSTANCE_LOADS (slp_instn)[0];
!       if (!node->load_permutation.exists ())
!         return false;
!       for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
!         {
!           if (!load->load_permutation.exists ())
!             return false;
!           FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
!             if (lidx != node->load_permutation[j])
!               return false;
!         }
!       /* Check that the loads in the first sequence are different and there
!          are no gaps between them.  */
!       load_index = sbitmap_alloc (group_size);
!       bitmap_clear (load_index);
!       FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
!         {
!           if (bitmap_bit_p (load_index, lidx))
!             {
!               sbitmap_free (load_index);
!               return false;
!             }
!           bitmap_set_bit (load_index, lidx);
!         }
!       for (i = 0; i < group_size; i++)
!         if (!bitmap_bit_p (load_index, i))
!           {
!             sbitmap_free (load_index);
!             return false;
!           }
!       sbitmap_free (load_index);
! 
!       /* This permutation is valid for reduction.  Since the order of the
!          statements in the nodes is not important unless they are memory
!          accesses, we can rearrange the statements in all the nodes
!          according to the order of the loads.  */
!       vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
!                                 node->load_permutation);
! 
!       /* We are done, no actual permutations need to be generated.  */
!       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
!         SLP_TREE_LOAD_PERMUTATION (node).release ();
!       return true;
      }
  
    /* In basic block vectorization we allow any subchain of an interleaving
--- 1373,1386 ----
    stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  
    /* Reduction (there are no data-refs in the root).
!      In reduction chain the order of the loads is not important.  */
    if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
        && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      {
!       if (vect_attempt_slp_rearrange_stmts (slp_instn))
!         return true;
!       /* Fallthru to general load permutation handling.  */
      }
  
    /* In basic block vectorization we allow any subchain of an interleaving
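To make the condition the new function tests concrete, here is a
standalone C sketch of it, detached from the GCC data structures
(the function name and types below are invented for the sketch, and
a plain array stands in for the sbitmap): rearranging is only valid
when all load nodes agree on one permutation and that permutation
covers each of the group_size slots exactly once.

#include <stdbool.h>

/* Sketch of the validity test in vect_attempt_slp_rearrange_stmts:
   PERM of length GROUP_SIZE must hit every slot 0..GROUP_SIZE-1
   exactly once, i.e. contain no duplicate and leave no gap.  */
static bool
perm_covers_group (const unsigned *perm, unsigned group_size)
{
  bool seen[256] = { false };	/* arbitrary limit for the sketch */
  if (group_size > 256)
    return false;
  for (unsigned i = 0; i < group_size; i++)
    {
      if (perm[i] >= group_size || seen[perm[i]])
	return false;		/* out-of-range or duplicate index */
      seen[perm[i]] = true;
    }
  /* GROUP_SIZE pairwise-distinct in-range indices necessarily cover
     every slot, which is what the explicit gap loop in the patch
     verifies.  */
  return true;
}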
Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c	(revision 224271)
--- gcc/tree-vect-stmts.c	(working copy)
*************** vectorizable_load (gimple stmt, gimple_s
*** 5995,6003 ****
    if ((grouped_load
         && (slp || PURE_SLP_STMT (stmt_info)))
        && (group_size > nunits
!           || nunits % group_size != 0
!           /* We don't support load permutations.  */
!           || slp_perm))
      {
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unhandled strided group load\n");
--- 5995,6001 ----
    if ((grouped_load
         && (slp || PURE_SLP_STMT (stmt_info)))
        && (group_size > nunits
!           || nunits % group_size != 0))
      {
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unhandled strided group load\n");
*************** vectorizable_load (gimple stmt, gimple_s
*** 6294,6299 ****
--- 6292,6298 ----
        alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
        int nloads = nunits;
        tree ltype = TREE_TYPE (vectype);
+       auto_vec<tree> dr_chain;
        if (slp)
          {
            nloads = nunits / group_size;
*************** vectorizable_load (gimple stmt, gimple_s
*** 6303,6309 ****
            ltype = vectype;
            ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
            ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
!           gcc_assert (!slp_perm);
          }
        for (j = 0; j < ncopies; j++)
          {
--- 6302,6309 ----
            ltype = vectype;
            ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
            ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
!           if (slp_perm)
!             dr_chain.create (ncopies);
          }
        for (j = 0; j < ncopies; j++)
          {
*************** vectorizable_load (gimple stmt, gimple_s
*** 6350,6362 ****
            }
          if (slp)
!           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
      return true;
    }
--- 6350,6369 ----
            }
          if (slp)
!           {
!             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
!             if (slp_perm)
!               dr_chain.quick_push (gimple_assign_lhs (new_stmt));
!           }
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
+     if (slp_perm)
+       vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
+                                     slp_node_instance, false);
      return true;
    }
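For the tree-vect-stmts.c side, what the vect_transform_slp_perm_load
call does with the defs collected in dr_chain can be modeled in
scalar form (again just an illustration, with invented names): the
values arrive in the order the strided group loads produced them, and
the load permutation selects which one feeds each SLP lane.

/* Scalar model of applying an SLP load permutation:
   OUT[i] = IN[PERM[i]].  E.g. perm = {1, 0} swaps two lanes, the
   kind of permutation the imaginary part of a complex multiply
   needs.  */
static void
apply_load_perm (double *out, const double *in,
		 const unsigned *perm, unsigned n)
{
  for (unsigned i = 0; i < n; i++)
    out[i] = in[perm[i]];
}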