Subject: [gcc r14-2117] tree-optimization/96208 - SLP of non-grouped loads
From: Richard Biener
Date: 2023-06-27  7:48 UTC
To: gcc-cvs

https://gcc.gnu.org/g:dd86a5a69cbda40cf76388a65d3317c91cb2b501

commit r14-2117-gdd86a5a69cbda40cf76388a65d3317c91cb2b501
Author: Richard Biener <rguenther@suse.de>
Date:   Thu Jun 22 11:40:46 2023 +0200

    tree-optimization/96208 - SLP of non-grouped loads
    
    The following extends SLP discovery to handle non-grouped loads
    in loop vectorization in the case where the same load appears in
    all lanes.
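    
    As a reduced illustration (a hypothetical example, not taken from
    the patch or the testsuite), such a splat shows up when one and the
    same scalar load feeds every lane of an SLP store group:
    
      void foo (double *restrict a, double *restrict b, double c, int n)
      {
        for (int i = 0; i < n; i++)
          {
            /* b[i] is a non-grouped load that appears identically in
               both lanes of the a[2*i], a[2*i+1] store group.  */
            a[2*i] = b[i] * c;
            a[2*i+1] = b[i] * c;
          }
      }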
    
    Code generation is adjusted to mimic what we do for the case
    of single-element interleaving (when the load is not unit-stride),
    which is already handled by SLP.  There are some limits we
    run into because peeling for gaps cannot cover all cases and
    we choose VMAT_CONTIGUOUS.  The patch does not try to address
    these issues yet.
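    
    For comparison, single-element interleaving (again only a
    hypothetical sketch, not from the patch) is the already-supported
    case where a lone, non-unit-stride load strides over a larger
    access group:
    
      void bar (double *restrict a, double *restrict b, int n)
      {
        for (int i = 0; i < n; i++)
          /* Only one element out of a group of four is read, so the
             load b[4*i] is single-element interleaving, not unit-stride.  */
          a[i] = b[4*i];
      }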
    
    The main obstacle is that these loads are not
    STMT_VINFO_GROUPED_ACCESS, which is a new situation for SLP.
    I know from the past that it's not a good idea to make them
    grouped.  Instead the following adjusts the relevant places to
    deal with SLP loads that are not STMT_VINFO_GROUPED_ACCESS.
    
    There's already a testcase covering the case the PR asks for,
    just XFAILed; the following adjusts that instead of adding
    another one.
    
    I do expect to have missed some places, so I don't plan to push
    this on a Friday.  Still, there may be feedback, so I'm posting
    this now.
    
    Bootstrapped and tested on x86_64-unknown-linux-gnu.
    
            PR tree-optimization/96208
            * tree-vect-slp.cc (vect_build_slp_tree_1): Allow
            a non-grouped load if it is the same for all lanes.
            (vect_build_slp_tree_2): Handle not grouped loads.
            (vect_optimize_slp_pass::remove_redundant_permutations):
            Likewise.
            (vect_transform_slp_perm_load_1): Likewise.
            * tree-vect-stmts.cc (vect_model_load_cost): Likewise.
            (get_group_load_store_type): Likewise.  Handle
            invariant accesses.
            (vectorizable_load): Likewise.
    
            * gcc.dg/vect/slp-46.c: Adjust for new vectorizations.
            * gcc.dg/vect/bb-slp-pr65935.c: Adjust.

Diff:
---
 gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c |  16 ++--
 gcc/testsuite/gcc.dg/vect/slp-46.c         |   2 +-
 gcc/tree-vect-slp.cc                       |  51 ++++++++----
 gcc/tree-vect-stmts.cc                     | 128 ++++++++++++++++++-----------
 4 files changed, 127 insertions(+), 70 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
index ee121364910..8cefa7f52af 100644
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
@@ -24,11 +24,17 @@ void rephase (void)
   struct site *s;
   for(i=0,s=lattice;i<sites_on_node;i++,s++)
     for(dir=0;dir<32;dir++)
-      for(j=0;j<3;j++)for(k=0;k<3;k++)
-	{
-	  s->link[dir].e[j][k].real *= s->phase[dir];
-	  s->link[dir].e[j][k].imag *= s->phase[dir];
-	}
+      {
+	for(j=0;j<3;j++)
+	  for(k=0;k<3;k++)
+	    {
+	      s->link[dir].e[j][k].real *= s->phase[dir];
+	      s->link[dir].e[j][k].imag *= s->phase[dir];
+	    }
+	/* Avoid loop vectorizing the outer loop after unrolling
+	   the inners.  */
+	__asm__ volatile ("" : : : "memory");
+      }
 }
 
 int main()
diff --git a/gcc/testsuite/gcc.dg/vect/slp-46.c b/gcc/testsuite/gcc.dg/vect/slp-46.c
index 18476a43d3f..79ed0bb9f6b 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-46.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-46.c
@@ -94,4 +94,4 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_load_lanes } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index fee992d0171..8cb1ac1f319 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1286,15 +1286,19 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 	{
 	  if (load_p
 	      && rhs_code != CFN_GATHER_LOAD
-	      && rhs_code != CFN_MASK_GATHER_LOAD)
+	      && rhs_code != CFN_MASK_GATHER_LOAD
+	      /* Not grouped loads are handled as externals for BB
+		 vectorization.  For loop vectorization we can handle
+		 splats the same we handle single element interleaving.  */
+	      && (is_a <bb_vec_info> (vinfo)
+		  || stmt_info != first_stmt_info))
 	    {
 	      /* Not grouped load.  */
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 				 "Build SLP failed: not grouped load %G", stmt);
 
-	      /* FORNOW: Not grouped loads are not supported.  */
-	      if (is_a <bb_vec_info> (vinfo) && i != 0)
+	      if (i != 0)
 		continue;
 	      /* Fatal mismatch.  */
 	      matches[0] = false;
@@ -1302,7 +1306,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 	    }
 
 	  /* Not memory operation.  */
-	  if (!phi_p
+	  if (!load_p
+	      && !phi_p
 	      && rhs_code.is_tree_code ()
 	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
 	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
@@ -1774,7 +1779,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
     return NULL;
 
   /* If the SLP node is a load, terminate the recursion unless masked.  */
-  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+  if (STMT_VINFO_DATA_REF (stmt_info)
       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
     {
       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
@@ -1798,8 +1803,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
 	    {
-	      int load_place = vect_get_place_in_interleaving_chain
-		  (load_info, first_stmt_info);
+	      int load_place;
+	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+		load_place = vect_get_place_in_interleaving_chain
+				(load_info, first_stmt_info);
+	      else
+		load_place = 0;
 	      gcc_assert (load_place != -1);
 	      load_permutation.safe_push (load_place);
 	    }
@@ -5439,6 +5448,16 @@ vect_optimize_slp_pass::remove_redundant_permutations ()
 		this_load_permuted = true;
 		break;
 	      }
+	  /* When this isn't a grouped access we know it's single element
+	     and contiguous.  */
+	  if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
+	    {
+	      if (!this_load_permuted
+		  && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
+		      || SLP_TREE_LANES (node) == 1))
+		SLP_TREE_LOAD_PERMUTATION (node).release ();
+	      continue;
+	    }
 	  stmt_vec_info first_stmt_info
 	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
 	  if (!this_load_permuted
@@ -8129,12 +8148,16 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   tree vectype = SLP_TREE_VECTYPE (node);
   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   unsigned int mask_element;
+  unsigned dr_group_size;
   machine_mode mode;
 
   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
-  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+    dr_group_size = 1;
+  else
+    {
+      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      dr_group_size = DR_GROUP_SIZE (stmt_info);
+    }
 
   mode = TYPE_MODE (vectype);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
@@ -8175,7 +8198,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   unsigned int nelts_to_build;
   unsigned int nvectors_per_build;
   unsigned int in_nlanes;
-  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
+  bool repeating_p = (group_size == dr_group_size
 		      && multiple_p (nunits, group_size));
   if (repeating_p)
     {
@@ -8188,7 +8211,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
 	 it at least one to ensure the later computation for n_perms
 	 proceed.  */
       nvectors_per_build = nstmts > 0 ? nstmts : 1;
-      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
+      in_nlanes = dr_group_size * 3;
     }
   else
     {
@@ -8200,7 +8223,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
       mask.new_vector (const_nunits, const_nunits, 1);
       nelts_to_build = const_vf * group_size;
       nvectors_per_build = 1;
-      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
+      in_nlanes = const_vf * dr_group_size;
     }
   auto_sbitmap used_in_lanes (in_nlanes);
   bitmap_clear (used_in_lanes);
@@ -8214,7 +8237,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
     {
       unsigned int iter_num = j / group_size;
       unsigned int stmt_num = j % group_size;
-      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
+      unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
       bitmap_set_bit (used_in_lanes, i);
       if (repeating_p)
 	{
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b31971e99a4..d642d3c257f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1150,6 +1150,8 @@ vect_model_load_cost (vec_info *vinfo,
       /* If the load is permuted then the alignment is determined by
 	 the first group element not by the first scalar stmt DR.  */
       stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      if (!first_stmt_info)
+	first_stmt_info = stmt_info;
       /* Record the cost for the permutation.  */
       unsigned n_perms, n_loads;
       vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
@@ -2203,12 +2205,24 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 {
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
-  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+  stmt_vec_info first_stmt_info;
+  unsigned int group_size;
+  unsigned HOST_WIDE_INT gap;
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    {
+      first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      group_size = DR_GROUP_SIZE (first_stmt_info);
+      gap = DR_GROUP_GAP (first_stmt_info);
+    }
+  else
+    {
+      first_stmt_info = stmt_info;
+      group_size = 1;
+      gap = 0;
+    }
   dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
-  unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
   bool single_element_p = (stmt_info == first_stmt_info
 			   && !DR_GROUP_NEXT_ELEMENT (stmt_info));
-  unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
   /* True if the vectorized statements would access beyond the last
@@ -2311,11 +2325,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 		    *memory_access_type = VMAT_ELEMENTWISE;
 		}
 	    }
-	  else
+	  else if (cmp == 0 && loop_vinfo)
 	    {
-	      gcc_assert (!loop_vinfo || cmp > 0);
-	      *memory_access_type = VMAT_CONTIGUOUS;
+	      gcc_assert (vls_type == VLS_LOAD);
+	      *memory_access_type = VMAT_INVARIANT;
+	      /* Invariant accesses perform only component accesses, alignment
+		 is irrelevant for them.  */
+	      *alignment_support_scheme = dr_unaligned_supported;
 	    }
+	  else
+	    *memory_access_type = VMAT_CONTIGUOUS;
 
 	  /* When we have a contiguous access across loop iterations
 	     but the access in the loop doesn't cover the full vector
@@ -2540,7 +2559,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
 	 is irrelevant for them.  */
       *alignment_support_scheme = dr_unaligned_supported;
     }
-  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info) || slp_node)
     {
       if (!get_group_load_store_type (vinfo, stmt_info, vectype, slp_node,
 				      masked_p,
@@ -9464,46 +9483,6 @@ vectorizable_load (vec_info *vinfo,
 	  return false;
 	}
 
-      if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
-	{
-	  slp_perm = true;
-
-	  if (!loop_vinfo)
-	    {
-	      /* In BB vectorization we may not actually use a loaded vector
-		 accessing elements in excess of DR_GROUP_SIZE.  */
-	      stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
-	      group_info = DR_GROUP_FIRST_ELEMENT (group_info);
-	      unsigned HOST_WIDE_INT nunits;
-	      unsigned j, k, maxk = 0;
-	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
-		if (k > maxk)
-		  maxk = k;
-	      tree vectype = SLP_TREE_VECTYPE (slp_node);
-	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-		  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
-		{
-		  if (dump_enabled_p ())
-		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				     "BB vectorization with gaps at the end of "
-				     "a load is not supported\n");
-		  return false;
-		}
-	    }
-
-	  auto_vec<tree> tem;
-	  unsigned n_perms;
-	  if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
-					     true, &n_perms))
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-				 vect_location,
-				 "unsupported load permutation\n");
-	      return false;
-	    }
-	}
-
       /* Invalidate assumptions made by dependence analysis when vectorization
 	 on the unrolled body effectively re-orders stmts.  */
       if (!PURE_SLP_STMT (stmt_info)
@@ -9521,6 +9500,46 @@ vectorizable_load (vec_info *vinfo,
   else
     group_size = 1;
 
+  if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+    {
+      slp_perm = true;
+
+      if (!loop_vinfo)
+	{
+	  /* In BB vectorization we may not actually use a loaded vector
+	     accessing elements in excess of DR_GROUP_SIZE.  */
+	  stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+	  group_info = DR_GROUP_FIRST_ELEMENT (group_info);
+	  unsigned HOST_WIDE_INT nunits;
+	  unsigned j, k, maxk = 0;
+	  FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
+	      if (k > maxk)
+		maxk = k;
+	  tree vectype = SLP_TREE_VECTYPE (slp_node);
+	  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
+	      || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "BB vectorization with gaps at the end of "
+				 "a load is not supported\n");
+	      return false;
+	    }
+	}
+
+      auto_vec<tree> tem;
+      unsigned n_perms;
+      if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
+					 true, &n_perms))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+			     vect_location,
+			     "unsupported load permutation\n");
+	  return false;
+	}
+    }
+
   vect_memory_access_type memory_access_type;
   enum dr_alignment_support alignment_support_scheme;
   int misalignment;
@@ -9898,10 +9917,19 @@ vectorizable_load (vec_info *vinfo,
       || (!slp && memory_access_type == VMAT_CONTIGUOUS))
     grouped_load = false;
 
-  if (grouped_load)
+  if (grouped_load
+      || (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()))
     {
-      first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
-      group_size = DR_GROUP_SIZE (first_stmt_info);
+      if (grouped_load)
+	{
+	  first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+	  group_size = DR_GROUP_SIZE (first_stmt_info);
+	}
+      else
+	{
+	  first_stmt_info = stmt_info;
+	  group_size = 1;
+	}
       /* For SLP vectorization we directly vectorize a subchain
          without permutation.  */
       if (slp && ! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
