[PATCH] tree-optimization/49955 - BB reduction with odd number of lanes

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] tree-optimization/49955 - BB reduction with odd number of lanes
@ 2023-08-07 13:30 Richard Biener
  0 siblings, 0 replies; only message in thread
From: Richard Biener @ 2023-08-07 13:30 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.sandiford

The following enhances BB reduction vectorization to support
vectorizing only a subset of the lanes, keeping the rest as
scalar ops.  For now we try to make the number of lanes even
by leaving alone the "last" lane.  That's because SLP discovery
with all lanes will fail too soon to get us any hint on which
lane to strip, and likewise we don't know what vector modes the
target supports, so restricting ourselves to power-of-two or
other cases isn't easy.

This is enough to get at the vectorization opportunity for the
testcase in the PR - albeit with the chosen lanes not optimal
but at least vectorizable.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

I failed to write a small testcase because of that "optimal"
lane selection and PR110935.

	PR tree-optimization/49955
	* tree-vectorizer.h (_slp_instance::remain_stmts): New.
	(SLP_INSTANCE_REMAIN_STMTS): Likewise.
	* tree-vect-slp.cc (vect_free_slp_instance): Release
	SLP_INSTANCE_REMAIN_STMTS.
	(vect_build_slp_instance): Make the number of lanes of
	a BB reduction even.
	(vectorize_slp_instance_root_stmt): Handle unvectorized
	defs of a BB reduction.

	* gfortran.dg/vect/pr49955.f: New testcase.
---
 gcc/testsuite/gfortran.dg/vect/pr49955.f | 38 ++++++++++++++++++++++++
 gcc/tree-vect-slp.cc                     | 30 ++++++++++++++++++-
 gcc/tree-vectorizer.h                    |  5 ++++
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gfortran.dg/vect/pr49955.f

diff --git a/gcc/testsuite/gfortran.dg/vect/pr49955.f b/gcc/testsuite/gfortran.dg/vect/pr49955.f
new file mode 100644
index 00000000000..a73cd5ada03
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/vect/pr49955.f
@@ -0,0 +1,38 @@
+! { dg-do compile }
+! { dg-additional-options "-ffast-math -fdump-tree-slp1" }
+
+      subroutine shell(nx,ny,nz,q,dt,cfl,dx,dy,dz,cfll,gm,Pr,Re)
+      implicit none
+      integer nx,ny,nz,i,j,k
+      real*8 cfl,dx,dy,dz,dt
+      real*8 gm,Re,Pr,cfll,t1,t2,t3,t4,t5,t6,t7,t8,mu
+      real*8 q(5,nx,ny,nz)
+
+      if (cfll.ge.cfl) cfll=cfl
+      t8=0.0d0
+
+      do k=1,nz
+         do j=1,ny
+            do i=1,nx
+               t1=q(1,i,j,k)
+               t2=q(2,i,j,k)/t1
+               t3=q(3,i,j,k)/t1
+               t4=q(4,i,j,k)/t1
+               t5=(gm-1.0d0)*(q(5,i,j,k)-0.5d0*t1*(t2*t2+t3*t3+t4*t4))
+               t6=dSQRT(gm*t5/t1)
+               mu=gm*Pr*(gm*t5/t1)**0.75d0*2.0d0/Re/t1
+               t7=((dabs(t2)+t6)/dx+mu/dx**2)**2 +
+     1            ((dabs(t3)+t6)/dy+mu/dy**2)**2 +
+     2            ((dabs(t4)+t6)/dz+mu/dz**2)**2
+               t7=DSQRT(t7)
+               t8=max(t8,t7)
+            enddo
+         enddo
+      enddo
+      dt=cfll / t8
+
+      return
+      end
+
+! We don't have an effective target for reduc_plus_scal optab support
+! { dg-final { scan-tree-dump ".REDUC_PLUS" "slp1" { target x86_64-*-* } } }
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index eab3dcd40ec..070ab3ff7ae 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -209,6 +209,7 @@ vect_free_slp_instance (slp_instance instance)
   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
   SLP_INSTANCE_LOADS (instance).release ();
   SLP_INSTANCE_ROOT_STMTS (instance).release ();
+  SLP_INSTANCE_REMAIN_STMTS (instance).release ();
   instance->subgraph_entries.release ();
   instance->cost_vec.release ();
   free (instance);
@@ -3128,6 +3129,16 @@ vect_build_slp_instance (vec_info *vinfo,
 			 "  %G", scalar_stmts[i]->stmt);
     }
 
+  /* When a BB reduction doesn't have an even number of lanes
+     strip it down, treating the remaining lane as scalar.
+     ???  Selecting the optimal set of lanes to vectorize would be nice
+     but SLP build for all lanes will fail quickly because we think
+     we're going to need unrolling.  */
+  auto_vec<stmt_vec_info> remain;
+  if (kind == slp_inst_kind_bb_reduc
+      && (scalar_stmts.length () & 1))
+    remain.safe_push (scalar_stmts.pop ());
+
   /* Build the tree for the SLP instance.  */
   unsigned int group_size = scalar_stmts.length ();
   bool *matches = XALLOCAVEC (bool, group_size);
@@ -3175,6 +3186,10 @@ vect_build_slp_instance (vec_info *vinfo,
 	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
 	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
 	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+	  if (!remain.is_empty ())
+	    SLP_INSTANCE_REMAIN_STMTS (new_instance) = remain.copy ();
+	  else
+	    SLP_INSTANCE_REMAIN_STMTS (new_instance) = vNULL;
 	  SLP_INSTANCE_KIND (new_instance) = kind;
 	  new_instance->reduc_phis = NULL;
 	  new_instance->cost_vec = vNULL;
@@ -9138,7 +9153,20 @@ vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
 	gcc_unreachable ();
       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
 				      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
-
+      if (!SLP_INSTANCE_REMAIN_STMTS (instance).is_empty ())
+	{
+	  tree rem_def = NULL_TREE;
+	  for (auto rem : SLP_INSTANCE_REMAIN_STMTS (instance))
+	    if (!rem_def)
+	      rem_def = gimple_get_lhs (rem->stmt);
+	    else
+	      rem_def = gimple_build (&epilogue, reduc_code,
+				      TREE_TYPE (scalar_def),
+				      rem_def, gimple_get_lhs (rem->stmt));
+	  scalar_def = gimple_build (&epilogue, reduc_code,
+				     TREE_TYPE (scalar_def),
+				     scalar_def, rem_def);
+	}
       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a65161499ea..dea29a74ebb 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -257,6 +257,10 @@ public:
      from, NULL otherwise.  */
   vec<stmt_vec_info> root_stmts;
 
+  /* For slp_inst_kind_bb_reduc the defs that were not vectorized, NULL
+     otherwise.  */
+  vec<stmt_vec_info> remain_stmts;
+
   /* The unrolling factor required to vectorized this SLP instance.  */
   poly_uint64 unrolling_factor;
 
@@ -285,6 +289,7 @@ public:
 #define SLP_INSTANCE_UNROLLING_FACTOR(S)         (S)->unrolling_factor
 #define SLP_INSTANCE_LOADS(S)                    (S)->loads
 #define SLP_INSTANCE_ROOT_STMTS(S)               (S)->root_stmts
+#define SLP_INSTANCE_REMAIN_STMTS(S)             (S)->remain_stmts
 #define SLP_INSTANCE_KIND(S)                     (S)->kind
 
 #define SLP_TREE_CHILDREN(S)                     (S)->children
-- 
2.35.3

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-08-07 13:30 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-07 13:30 [PATCH] tree-optimization/49955 - BB reduction with odd number of lanes Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).