public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 1/3] Do single-lane SLP discovery for reductions
@ 2024-05-29 13:23 Richard Biener
  0 siblings, 0 replies; only message in thread
From: Richard Biener @ 2024-05-29 13:23 UTC (permalink / raw)
  To: gcc-patches

The following performs single-lane SLP discovery for reductions.
It requires a fixup for outer loop vectorization where a check
for multiple types needs adjustments as otherwise bogus pointer
IV increments happen when there are multiple copies of vector stmts
in the inner loop.

For the reduction epilog handling this extends the optimized path
to cover the trivial single-lane SLP reduction case.

The fix for PR65518 implemented in vect_grouped_load_supported for
non-SLP needs a SLP counterpart that I put in get_group_load_store_type.

I've squashed parts of the previous series, no changes but the
added last patch (in this series) and the already pushed
r15-858-g65aa46ffc3b06b.

	* tree-vect-slp.cc (vect_build_slp_tree_2): Only multi-lane
	discoveries are reduction chains and need special backedge
	treatment.
	(vect_analyze_slp): Fall back to single-lane SLP discovery
	for reductions.  Make sure to try single-lane SLP reduction
	for all reductions as fallback.
	(vectorizable_load): Avoid outer loop SLP vectorization with
	multi-copy vector stmts in the inner loop.
	(vectorizable_store): Likewise.
	* tree-vect-loop.cc (vect_create_epilog_for_reduction): Allow
	direct opcode and shift reduction also for SLP reductions
	with a single lane.
	* tree-vect-stmts.cc (get_group_load_store_type): For SLP also
	check for the PR65518 single-element interleaving case as done in
	vect_grouped_load_supported.
---
 gcc/tree-vect-loop.cc  |  4 +--
 gcc/tree-vect-slp.cc   | 71 ++++++++++++++++++++++++++++++++----------
 gcc/tree-vect-stmts.cc | 24 ++++++++++++--
 3 files changed, 78 insertions(+), 21 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 3b94bb13a8b..24a1239f016 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6507,7 +6507,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   /* 2.3 Create the reduction code, using one of the three schemes described
          above. In SLP we simply need to extract all the elements from the 
          vector (without reducing them), so we use scalar shifts.  */
-  else if (reduc_fn != IFN_LAST && !slp_reduc)
+  else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
     {
       tree tmp;
       tree vec_elem_type;
@@ -6677,7 +6677,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
       reduc_inputs[0] = new_temp;
 
-      if (reduce_with_shift && !slp_reduc)
+      if (reduce_with_shift && (!slp_reduc || group_size == 1))
 	{
 	  int element_bitsize = tree_to_uhwi (bitsize);
 	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 7a963e28063..6ab00661382 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1912,7 +1912,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	    /* Reduction chain backedge defs are filled manually.
 	       ???  Need a better way to identify a SLP reduction chain PHI.
 	       Or a better overall way to SLP match those.  */
-	    if (all_same && def_type == vect_reduction_def)
+	    if (stmts.length () > 1
+		&& all_same && def_type == vect_reduction_def)
 	      skip_args[loop_latch_edge (loop)->dest_idx] = true;
 	  }
 	else if (def_type != vect_internal_def)
@@ -3910,9 +3911,10 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 	  }
 
       /* Find SLP sequences starting from groups of reductions.  */
-      if (loop_vinfo->reductions.length () > 1)
+      if (loop_vinfo->reductions.length () > 0)
 	{
-	  /* Collect reduction statements.  */
+	  /* Collect reduction statements we can combine into
+	     a SLP reduction.  */
 	  vec<stmt_vec_info> scalar_stmts;
 	  scalar_stmts.create (loop_vinfo->reductions.length ());
 	  for (auto next_info : loop_vinfo->reductions)
@@ -3925,25 +3927,60 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 		     reduction path.  In that case we'd have to reverse
 		     engineer that conversion stmt following the chain using
 		     reduc_idx and from the PHI using reduc_def.  */
-		  && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
-		  /* Do not discover SLP reductions for lane-reducing ops, that
-		     will fail later.  */
-		  && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
+		  && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
+		{
+		  /* Do not discover SLP reductions combining lane-reducing
+		     ops, that will fail later.  */
+		  if (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
 		      || (gimple_assign_rhs_code (g) != DOT_PROD_EXPR
 			  && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR
-			  && gimple_assign_rhs_code (g) != SAD_EXPR)))
-		scalar_stmts.quick_push (next_info);
+			  && gimple_assign_rhs_code (g) != SAD_EXPR))
+		    scalar_stmts.quick_push (next_info);
+		  else
+		    {
+		      /* Do SLP discovery for single-lane reductions.  */
+		      vec<stmt_vec_info> stmts;
+		      vec<stmt_vec_info> roots = vNULL;
+		      vec<tree> remain = vNULL;
+		      stmts.create (1);
+		      stmts.quick_push (next_info);
+		      vect_build_slp_instance (vinfo,
+					       slp_inst_kind_reduc_group,
+					       stmts, roots, remain,
+					       max_tree_size, &limit,
+					       bst_map, NULL);
+		    }
+		}
 	    }
-	  if (scalar_stmts.length () > 1)
+	  /* Save for re-processing on failure.  */
+	  vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
+	  vec<stmt_vec_info> roots = vNULL;
+	  vec<tree> remain = vNULL;
+	  if (scalar_stmts.length () <= 1
+	      || !vect_build_slp_instance (loop_vinfo,
+					   slp_inst_kind_reduc_group,
+					   scalar_stmts, roots, remain,
+					   max_tree_size, &limit, bst_map,
+					   NULL))
 	    {
-	      vec<stmt_vec_info> roots = vNULL;
-	      vec<tree> remain = vNULL;
-	      vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group,
-				       scalar_stmts, roots, remain,
-				       max_tree_size, &limit, bst_map, NULL);
+	      if (scalar_stmts.length () <= 1)
+		scalar_stmts.release ();
+	      /* Do SLP discovery for single-lane reductions.  */
+	      for (auto stmt_info : saved_stmts)
+		{
+		  vec<stmt_vec_info> stmts;
+		  vec<stmt_vec_info> roots = vNULL;
+		  vec<tree> remain = vNULL;
+		  stmts.create (1);
+		  stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
+		  vect_build_slp_instance (vinfo,
+					   slp_inst_kind_reduc_group,
+					   stmts, roots, remain,
+					   max_tree_size, &limit,
+					   bst_map, NULL);
+		}
+	      saved_stmts.release ();
 	    }
-	  else
-	    scalar_stmts.release ();
 	}
     }
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 935d80f0e1b..b26cc74f417 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2160,6 +2160,23 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 		}
 	      overrun_p = true;
 	    }
+
+	  /* If this is single-element interleaving with an element
+	     distance that leaves unused vector loads around punt - we
+	     at least create very sub-optimal code in that case (and
+	     blow up memory, see PR65518).  */
+	  if (loop_vinfo
+	      && *memory_access_type == VMAT_CONTIGUOUS
+	      && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+	      && single_element_p
+	      && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "single-element interleaving not supported "
+				 "for not adjacent vector loads\n");
+	      return false;
+	    }
 	}
     }
   else
@@ -8202,7 +8219,9 @@ vectorizable_store (vec_info *vinfo,
   gcc_assert (ncopies >= 1);
 
   /* FORNOW.  This restriction should be relaxed.  */
-  if (loop && nested_in_vect_loop_p (loop, stmt_info) && ncopies > 1)
+  if (loop
+      && nested_in_vect_loop_p (loop, stmt_info)
+      && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9945,7 +9964,8 @@ vectorizable_load (vec_info *vinfo,
   gcc_assert (ncopies >= 1);
 
   /* FORNOW. This restriction should be relaxed.  */
-  if (nested_in_vect_loop && ncopies > 1)
+  if (nested_in_vect_loop
+      && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-- 
2.35.3


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-05-29 13:23 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-29 13:23 [PATCH 1/3] Do single-lane SLP discovery for reductions Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).