From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <rguenth@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1666)
	id 583FE3858039; Mon, 16 Oct 2023 12:50:06 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 583FE3858039
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1697460606;
	bh=5ja0/vOGmXIT9gISDCsKbRMREZSq7WOpIHkwsU+9SyY=;
	h=From:To:Subject:Date:From;
	b=WzqA7a2Vc8ZkmQW0tAVTdXpL0ub/n+bnb+cZe5RGSYUTKddPaHJ+D5tAoULLcAvvJ
	 kHqUjwxFLm0B/kNaIyrpieAPNdmbm1556JyAK4sr87K1uJhJe0Af52ixFHByNSY33F
	 uOMhej3Y4TT9JfQ/GAyRgFWklXegDqpaycLZ9Cn0=
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Richard Biener <rguenth@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid splitting store
 dataref groups during SLP discovery
X-Act-Checkin: gcc
X-Git-Author: Richard Biener <rguenther@suse.de>
X-Git-Refname: refs/users/rguenth/heads/vect-force-slp
X-Git-Oldrev: 4eb4017ff1ca980e2ff8f78a21efd10f21f89c14
X-Git-Newrev: 2f4a092228dcb79966f5aea8a192f8b6a6ed6979
Message-Id: <20231016125006.583FE3858039@sourceware.org>
Date: Mon, 16 Oct 2023 12:50:06 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:2f4a092228dcb79966f5aea8a192f8b6a6ed6979

commit 2f4a092228dcb79966f5aea8a192f8b6a6ed6979
Author: Richard Biener <rguenther@suse.de>
Date:   Fri Sep 29 13:13:16 2023 +0200

    Avoid splitting store dataref groups during SLP discovery
    
    The following avoids splitting store dataref groups during SLP
    discovery but instead forces (eventually single-lane) consecutive
    lane SLP discovery for all lanes of the group, creating a VEC_PERM
    SLP node merging them so the store will always cover the whole group.
    
    I figured the patched function needs some refactoring so this is
    in draft state indenting-wise.  With this for example
    
    int x[1024], y[1024], z[1024], w[1024];
    void foo (void)
    {
      for (int i = 0; i < 256; i++)
        {
          x[4*i+0] = y[2*i+0];
          x[4*i+1] = y[2*i+1];
          x[4*i+2] = z[i];
          x[4*i+3] = w[i];
        }
    }
    
    which was previously using hybrid SLP can now be fully SLPed and
    SSE code generated looks better (but of course you never know,
    I didn't actually benchmark).  We of course need a VF of four here.
    
    .L2:
            movdqa  z(%rax), %xmm0
            movdqa  w(%rax), %xmm4
            movdqa  y(%rax,%rax), %xmm2
            movdqa  y+16(%rax,%rax), %xmm1
            movdqa  %xmm0, %xmm3
            punpckhdq       %xmm4, %xmm0
            punpckldq       %xmm4, %xmm3
            movdqa  %xmm2, %xmm4
            shufps  $238, %xmm3, %xmm2
            movaps  %xmm2, x+16(,%rax,4)
            movdqa  %xmm1, %xmm2
            shufps  $68, %xmm3, %xmm4
            shufps  $68, %xmm0, %xmm2
            movaps  %xmm4, x(,%rax,4)
            shufps  $238, %xmm0, %xmm1
            movaps  %xmm2, x+32(,%rax,4)
            movaps  %xmm1, x+48(,%rax,4)
            addq    $16, %rax
            cmpq    $1024, %rax
            jne     .L2
    
    The extra permute nodes unfortunately sometimes do not behave
    nicely wrt vect_is_simple_use since when the source is an
    invariant or external there's no def stmt we can fake as
    representative but vect_is_simple_use eventually gets the
    caller the scalar operand and its definition.  One might
    argue using SLP_TREE_OPS and getting an external def would
    maybe be more to the point, also since permute optimization
    could change whether or not that appears.
    
            * tree-vect-slp.cc (vect_build_slp_instance): Do not split
            dataref groups on discovery failure.

Diff:
---
 gcc/tree-vect-slp.cc | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 168 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 97bd2f3feea2..42a66992a633 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3349,8 +3349,6 @@ vect_build_slp_instance (vec_info *vinfo,
   else
     {
       /* Failed to SLP.  */
-      /* Free the allocated memory.  */
-      scalar_stmts.release ();
     }
 
   stmt_vec_info stmt_info = stmt_info_;
@@ -3369,6 +3367,8 @@ vect_build_slp_instance (vec_info *vinfo,
       if (is_a <bb_vec_info> (vinfo)
 	  && (i > 1 && i < group_size))
 	{
+    /* Free the allocated memory.  */
+    scalar_stmts.release ();
 	  tree scalar_type
 	    = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
 	  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
@@ -3415,7 +3415,10 @@ vect_build_slp_instance (vec_info *vinfo,
 
       /* For loop vectorization split into arbitrary pieces of size > 1.  */
       if (is_a <loop_vec_info> (vinfo)
-	  && (i > 1 && i < group_size)
+	  && ((i > 1 && i < group_size)
+	      /* For single-lane SLP when only the first lane didn't fail
+		 also split to single-lanes.  */
+	      || (i > 0 && i < group_size && param_vect_single_lane_slp != 0))
 	  && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
 	{
 	  unsigned group1_size = i;
@@ -3424,6 +3427,164 @@ vect_build_slp_instance (vec_info *vinfo,
 	    dump_printf_loc (MSG_NOTE, vect_location,
 			     "Splitting SLP group at stmt %u\n", i);
 
+	  if (param_vect_single_lane_slp != 0)
+	    {
+	      /* Analyze the stored values and pinch them together with
+		 a permute node so we can preserve the whole store group.  */
+	      auto_vec<slp_tree> rhs_nodes;
+
+      /* Calculate the unrolling factor based on the smallest type.  */
+      poly_uint64 unrolling_factor = 1;
+
+	      unsigned int start = 0, end = i;
+	      while (start < group_size)
+		{
+		  gcc_assert (end - start >= 1);
+	      vec<stmt_vec_info> substmts;
+	      substmts.create (end - start);
+	      for (unsigned j = start; j < end; ++j)
+		substmts.quick_push (scalar_stmts[j]);
+	      max_nunits = 1;
+	      node = vect_build_slp_tree (vinfo, substmts, end - start,
+					  &max_nunits,
+					  matches, limit, &tree_size, bst_map);
+	      if (node)
+		{
+		  /* ???  Possibly not safe, but not sure how to check
+		     and fail SLP build?  */
+		  unrolling_factor = force_common_multiple (unrolling_factor,
+						  calculate_unrolling_factor
+						    (max_nunits, end - start));
+		  rhs_nodes.safe_push (node);
+		  start = end;
+		  end = group_size;
+		}
+	      else
+		{
+		  substmts.release ();
+		  gcc_assert (end - start != 1);
+		  /* ???  It really happens that we soft-fail SLP
+		     build at a mismatch but the matching part hard-fails
+		     later.  As we know we arrived here with a group
+		     larger than one try a group of size one!  */
+		  if (!matches[0])
+		    end = start + 1;
+		  else
+		    for (unsigned j = start; j < end; j++)
+		      if (!matches[j - start])
+			{
+			  end = j;
+			  break;
+			}
+		}
+		}
+	      /* Now we assume we can build the root SLP node from all
+	         stores.  */
+	      node = vect_create_new_slp_node (scalar_stmts, 1);
+	      SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+	      slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
+							VEC_PERM_EXPR);
+	      SLP_TREE_CHILDREN (node).quick_push (perm);
+	      SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
+	      SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
+	      SLP_TREE_LANES (perm) = group_size;
+	      /* ???  We should set this NULL but that's not expected.  */
+	      SLP_TREE_REPRESENTATIVE (perm)
+		= SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[0]);
+	      for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+		{
+		  SLP_TREE_CHILDREN (perm)
+		    .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+		  SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (rhs_nodes[j])[0])++;
+		  for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
+		    {
+		      /* ???  We should populate SLP_TREE_SCALAR_STMTS
+		         or SLP_TREE_SCALAR_OPS but then we might have
+			 a mix of both in our children.  */
+		      SLP_TREE_LANE_PERMUTATION (perm)
+			.quick_push (std::make_pair (j, k));
+		    }
+		  vect_free_slp_tree (rhs_nodes[j]);
+		}
+
+	      /* ???  Now we have a single permute node but when that's
+	         fed more than two inputs it's prone to hit the limitation
+		 on at most two sources for a VEC_PERM_EXPR.  Ideally
+		 we'd defer the following to the optimize-slp pass but
+		 for now split it here.
+		 ???  Optimally we'd produce permute nodes feeding in
+		 the same number of lanes from each input and also have
+		 the same vector type (only the width will eventually
+		 differ here), for now just do "something".  */
+	      while (SLP_TREE_CHILDREN (perm).length () > 2)
+		{
+		  slp_tree b = SLP_TREE_CHILDREN (perm).pop ();
+		  slp_tree a = SLP_TREE_CHILDREN (perm).pop ();
+		  unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
+		  slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
+		  SLP_TREE_LANES (permab) = n;
+		  SLP_TREE_LANE_PERMUTATION (permab).create (n);
+		  SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm); /* ??? */
+		  /* ???  We should set this NULL but that's not expected.  */
+		  SLP_TREE_REPRESENTATIVE (permab)
+		    = SLP_TREE_REPRESENTATIVE (perm);
+		  SLP_TREE_CHILDREN (permab).quick_push (a);
+		  for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
+		    SLP_TREE_LANE_PERMUTATION (permab)
+		      .quick_push (std::make_pair (0, k));
+		  SLP_TREE_CHILDREN (permab).quick_push (b);
+		  for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
+		    SLP_TREE_LANE_PERMUTATION (permab)
+		      .quick_push (std::make_pair (1, k));
+		  /* ???  Popluate SLP_TREE_SCALAR_STMTS/OPS of permab.  */
+		  SLP_TREE_CHILDREN (perm).quick_push (permab);
+		  for (unsigned k = group_size - n; k < group_size; ++k)
+		    SLP_TREE_LANE_PERMUTATION (perm)[k]
+		      = std::make_pair (SLP_TREE_CHILDREN (perm).length () - 1,
+					k - (group_size - n));
+		}
+
+	  /* Create a new SLP instance.  */
+	  slp_instance new_instance = XNEW (class _slp_instance);
+	  SLP_INSTANCE_TREE (new_instance) = node;
+	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
+	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+	  SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+	  SLP_INSTANCE_KIND (new_instance) = kind;
+	  new_instance->reduc_phis = NULL;
+	  new_instance->cost_vec = vNULL;
+	  new_instance->subgraph_entries = vNULL;
+
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "SLP size %u vs. limit %u.\n",
+			     tree_size, max_tree_size);
+
+	  vinfo->slp_instances.safe_push (new_instance);
+
+	  /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+	     the number of scalar stmts in the root in a few places.
+	     Verify that assumption holds.  */
+	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+			.length () == group_size);
+
+	  if (dump_enabled_p ())
+	    {
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "Final SLP tree for instance %p:\n",
+			       (void *) new_instance);
+	      vect_print_slp_graph (MSG_NOTE, vect_location,
+				    SLP_INSTANCE_TREE (new_instance));
+	    }
+	  return true;
+	    }
+	  else
+	    {
+
+    /* Free the allocated memory.  */
+    scalar_stmts.release ();
+
 	  stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
 							   group1_size);
 	  /* Loop vectorization cannot handle gaps in stores, make sure
@@ -3440,11 +3601,15 @@ vect_build_slp_instance (vec_info *vinfo,
 					      rest, kind, max_tree_size, limit);
 
 	  return res;
+	    }
 	}
 
       /* Even though the first vector did not all match, we might be able to SLP
 	 (some) of the remainder.  FORNOW ignore this possibility.  */
     }
+  else
+    /* Free the allocated memory.  */
+    scalar_stmts.release ();
 
   /* Failed to SLP.  */
   if (dump_enabled_p ())