[gcc r14-4628] Add support for SLP vectorization of OpenMP SIMD clone calls

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

* [gcc r14-4628] Add support for SLP vectorization of OpenMP SIMD clone calls
@ 2023-10-13 12:30 Richard Biener
  0 siblings, 0 replies; only message in thread
From: Richard Biener @ 2023-10-13 12:30 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:63eaccd114393f4692976bb78b30148e6d77a89e

commit r14-4628-g63eaccd114393f4692976bb78b30148e6d77a89e
Author: Richard Biener <rguenther@suse.de>
Date:   Thu Oct 12 14:25:07 2023 +0200

    Add support for SLP vectorization of OpenMP SIMD clone calls
    
    This adds support for SLP vectorization of OpenMP SIMD clone calls.
    There's a complication when vectorizing calls involving virtual
    operands since, for the first time, these are not only leaves (loads
    or stores).  With SLP this runs into the issue that the placement of
    the vectorized stmts is not necessarily at one of the original
    scalar stmts, which leads to the magic updating of virtual operands
    in vect_finish_stmt_generation not working.  So we run into the
    assert that updating virtual operands isn't necessary.  I've
    papered over this, similar to how we do for mismatched const/pure
    attribution, by setting vinfo->any_known_not_updated_vssa.
    
    I've added two basic testcases with multi-lane SLP and verified
    that with single-lane SLP enabled the rest of the existing testcases
    pass.
    
            * tree-vect-slp.cc (mask_call_maps): New.
            (vect_get_operand_map): Handle IFN_MASK_CALL.
            (vect_build_slp_tree_1): Likewise.
            * tree-vect-stmts.cc (vectorizable_simd_clone_call): Handle
            SLP.
    
            * gcc.dg/vect/slp-simd-clone-1.c: New testcase.
            * gcc.dg/vect/slp-simd-clone-2.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-simd-clone-1.c |  46 ++++++++++++
 gcc/testsuite/gcc.dg/vect/slp-simd-clone-2.c |  57 +++++++++++++++
 gcc/tree-vect-slp.cc                         |  20 +++++-
 gcc/tree-vect-stmts.cc                       | 102 +++++++++++++++++++--------
 4 files changed, 196 insertions(+), 29 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-simd-clone-1.c b/gcc/testsuite/gcc.dg/vect/slp-simd-clone-1.c
new file mode 100644
index 00000000000..6ccbb39b567
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-simd-clone-1.c
@@ -0,0 +1,46 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd" } */
+
+#include "tree-vect.h"
+
+int x[1024];
+
+#pragma omp declare simd simdlen(4) notinbranch
+__attribute__((noinline)) int
+foo (int a, int b)
+{
+  return a + b;
+}
+
+void __attribute__((noipa))
+bar (void)
+{
+#pragma omp simd
+  for (int i = 0; i < 512; i++)
+    {
+      x[2*i+0] = foo (x[2*i+0], x[2*i+0]);
+      x[2*i+1] = foo (x[2*i+1], x[2*i+1]);
+    }
+}
+
+int
+main ()
+{
+  int i;
+  check_vect ();
+
+#pragma GCC novector
+  for (i = 0; i < 1024; i++)
+    x[i] = i;
+
+  bar ();
+
+#pragma GCC novector
+  for (i = 0; i < 1024; i++)
+    if (x[i] != i + i)
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-simd-clone-2.c b/gcc/testsuite/gcc.dg/vect/slp-simd-clone-2.c
new file mode 100644
index 00000000000..98387c92486
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-simd-clone-2.c
@@ -0,0 +1,57 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd" } */
+/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
+
+#include "tree-vect.h"
+
+int x[1024];
+
+#pragma omp declare simd simdlen(4) inbranch
+__attribute__((noinline)) int
+foo (int a, int b)
+{
+  return a + b;
+}
+
+void __attribute__((noipa))
+bar (void)
+{
+#pragma omp simd
+  for (int i = 0; i < 512; i++)
+    {
+      if (x[2*i+0] < 10)
+	x[2*i+0] = foo (x[2*i+0], x[2*i+0]);
+      if (x[2*i+1] < 20)
+	x[2*i+1] = foo (x[2*i+1], x[2*i+1]);
+    }
+}
+
+int
+main ()
+{
+  int i;
+  check_vect ();
+
+#pragma GCC novector
+  for (i = 0; i < 1024; i++)
+    x[i] = i;
+
+  bar ();
+
+#pragma GCC novector
+  for (i = 0; i < 1024; i++)
+    {
+      if (((i & 1) && i < 20)
+	  || (!(i & 1) && i < 10))
+	{
+	  if (x[i] != i + i)
+	    abort ();
+	}
+      else if (x[i] != i)
+	abort ();
+    }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target avx2_runtime } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index fa098f9ff4e..af8f5031bd2 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -505,6 +505,14 @@ static const int arg2_map[] = { 1, 2 };
 static const int arg1_arg4_map[] = { 2, 1, 4 };
 static const int arg3_arg2_map[] = { 2, 3, 2 };
 static const int op1_op0_map[] = { 2, 1, 0 };
+static const int mask_call_maps[6][7] = {
+  { 1, 1, },
+  { 2, 1, 2, },
+  { 3, 1, 2, 3, },
+  { 4, 1, 2, 3, 4, },
+  { 5, 1, 2, 3, 4, 5, },
+  { 6, 1, 2, 3, 4, 5, 6 },
+};
 
 /* For most SLP statements, there is a one-to-one mapping between
    gimple arguments and child nodes.  If that is not true for STMT,
@@ -547,6 +555,15 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
 	  case IFN_MASK_STORE:
 	    return arg3_arg2_map;
 
+	  case IFN_MASK_CALL:
+	    {
+	      unsigned nargs = gimple_call_num_args (call);
+	      if (nargs >= 2 && nargs <= 7)
+		return mask_call_maps[nargs-2];
+	      else
+		return nullptr;
+	    }
+
 	  default:
 	    break;
 	  }
@@ -1070,7 +1087,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
       if (call_stmt)
 	{
 	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
-	  if (cfn != CFN_LAST)
+	  if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
 	    rhs_code = cfn;
 	  else
 	    rhs_code = CALL_EXPR;
@@ -1085,6 +1102,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 	      rhs_code = CFN_MASK_STORE;
 	    }
 	  else if ((cfn != CFN_LAST
+		    && cfn != CFN_MASK_CALL
 		    && internal_fn_p (cfn)
 		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
 		   || gimple_call_tail_p (call_stmt)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index ce925cc1d53..33b557c2a49 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4208,10 +4208,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
   if (loop_vinfo && nested_in_vect_loop_p (loop, stmt_info))
     return false;
 
-  /* FORNOW */
-  if (slp_node)
-    return false;
-
   /* Process function arguments.  */
   nargs = gimple_call_num_args (stmt) - arg_offset;
 
@@ -4220,6 +4216,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
     return false;
 
   arginfo.reserve (nargs, true);
+  auto_vec<slp_tree> slp_op;
+  slp_op.safe_grow_cleared (nargs);
 
   for (i = 0; i < nargs; i++)
     {
@@ -4231,9 +4229,12 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       thisarginfo.op = NULL_TREE;
       thisarginfo.simd_lane_linear = false;
 
-      op = gimple_call_arg (stmt, i + arg_offset);
-      if (!vect_is_simple_use (op, vinfo, &thisarginfo.dt,
-			       &thisarginfo.vectype)
+      int op_no = i + arg_offset;
+      if (slp_node)
+	op_no = vect_slp_child_index_for_operand (stmt, op_no);
+      if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
+			       op_no, &op, &slp_op[i],
+			       &thisarginfo.dt, &thisarginfo.vectype)
 	  || thisarginfo.dt == vect_uninitialized_def)
 	{
 	  if (dump_enabled_p ())
@@ -4244,7 +4245,13 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
       if (thisarginfo.dt == vect_constant_def
 	  || thisarginfo.dt == vect_external_def)
-	gcc_assert (thisarginfo.vectype == NULL_TREE);
+	{
+	  gcc_assert (vec_stmt || thisarginfo.vectype == NULL_TREE);
+	  if (!vec_stmt)
+	    thisarginfo.vectype = get_vectype_for_scalar_type (vinfo,
+							       TREE_TYPE (op),
+							       slp_node);
+	}
       else
 	gcc_assert (thisarginfo.vectype != NULL_TREE);
 
@@ -4301,15 +4308,14 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	  && thisarginfo.dt != vect_constant_def
 	  && thisarginfo.dt != vect_external_def
 	  && loop_vinfo
-	  && !slp_node
 	  && TREE_CODE (op) == SSA_NAME)
 	vect_simd_lane_linear (op, loop, &thisarginfo);
 
       arginfo.quick_push (thisarginfo);
     }
 
-  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  if (!vf.is_constant ())
+  if (loop_vinfo
+      && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -4318,6 +4324,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       return false;
     }
 
+  poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
+  unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 1;
   unsigned int badness = 0;
   struct cgraph_node *bestn = NULL;
   if (STMT_VINFO_SIMD_CLONE_INFO (stmt_info).exists ())
@@ -4328,7 +4336,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       {
 	unsigned int this_badness = 0;
 	unsigned int num_calls;
-	if (!constant_multiple_p (vf, n->simdclone->simdlen, &num_calls)
+	if (!constant_multiple_p (vf * group_size,
+				  n->simdclone->simdlen, &num_calls)
 	    || n->simdclone->nargs != nargs)
 	  continue;
 	if (num_calls != 1)
@@ -4454,7 +4463,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
   fndecl = bestn->decl;
   nunits = bestn->simdclone->simdlen;
-  ncopies = vector_unroll_factor (vf, nunits);
+  if (slp_node)
+    ncopies = vector_unroll_factor (vf * group_size, nunits);
+  else
+    ncopies = vector_unroll_factor (vf, nunits);
 
   /* If the function isn't const, only allow it in simd loops where user
      has asserted that at least nunits consecutive iterations can be
@@ -4469,6 +4481,15 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      if (slp_node)
+	for (unsigned i = 0; i < nargs; ++i)
+	  if (!vect_maybe_update_slp_op_vectype (slp_op[i], arginfo[i].vectype))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "incompatible vector types for invariants\n");
+	      return false;
+	    }
       /* When the original call is pure or const but the SIMD ABI dictates
 	 an aggregate return we will have to use a virtual definition and
 	 in a loop eventually even need to add a virtual PHI.  That's
@@ -4477,6 +4498,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	  && !gimple_vdef (stmt)
 	  && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
 	vinfo->any_known_not_updated_vssa = true;
+      /* ???  For SLP code-gen we end up inserting after the last
+	 vector argument def rather than at the original call position
+	 so automagic virtual operand updating doesn't work.  */
+      if (gimple_vuse (stmt) && slp_node)
+	vinfo->any_known_not_updated_vssa = true;
       STMT_VINFO_SIMD_CLONE_INFO (stmt_info).safe_push (bestn->decl);
       for (i = 0; i < nargs; i++)
 	if ((bestn->simdclone->args[i].arg_type
@@ -4526,8 +4552,14 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
   auto_vec<vec<tree> > vec_oprnds;
   auto_vec<unsigned> vec_oprnds_i;
-  vec_oprnds.safe_grow_cleared (nargs, true);
   vec_oprnds_i.safe_grow_cleared (nargs, true);
+  if (slp_node)
+    {
+      vec_oprnds.reserve_exact (nargs);
+      vect_get_slp_defs (vinfo, slp_node, &vec_oprnds);
+    }
+  else
+    vec_oprnds.safe_grow_cleared (nargs, true);
   for (j = 0; j < ncopies; ++j)
     {
       /* Build argument list for the vectorized call.  */
@@ -4558,9 +4590,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 		      gcc_assert ((k & (k - 1)) == 0);
 		      if (m == 0)
 			{
-			  vect_get_vec_defs_for_operand (vinfo, stmt_info,
-							 ncopies * o / k, op,
-							 &vec_oprnds[i]);
+			  if (!slp_node)
+			    vect_get_vec_defs_for_operand (vinfo, stmt_info,
+							   ncopies * o / k, op,
+							   &vec_oprnds[i]);
 			  vec_oprnds_i[i] = 0;
 			  vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
 			}
@@ -4596,10 +4629,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 			{
 			  if (m == 0 && l == 0)
 			    {
-			      vect_get_vec_defs_for_operand (vinfo, stmt_info,
-							     k * o * ncopies,
-							     op,
-							     &vec_oprnds[i]);
+			      if (!slp_node)
+				vect_get_vec_defs_for_operand (vinfo, stmt_info,
+							       k * o * ncopies,
+							       op,
+							       &vec_oprnds[i]);
 			      vec_oprnds_i[i] = 0;
 			      vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
 			    }
@@ -4670,10 +4704,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 			     elements as the current function.  */
 			  if (m == 0)
 			    {
-			      vect_get_vec_defs_for_operand (vinfo, stmt_info,
-							     o * ncopies,
-							     op,
-							     &vec_oprnds[i]);
+			      if (!slp_node)
+				vect_get_vec_defs_for_operand (vinfo, stmt_info,
+							       o * ncopies,
+							       op,
+							       &vec_oprnds[i]);
 			      vec_oprnds_i[i] = 0;
 			    }
 			  vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
@@ -4817,7 +4852,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
 		  if (j == 0 && l == 0)
 		    *vec_stmt = new_stmt;
-		  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+		  if (slp_node)
+		    SLP_TREE_VEC_DEFS (slp_node)
+		      .quick_push (gimple_assign_lhs (new_stmt));
+		  else
+		    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
 		}
 
 	      if (ratype)
@@ -4860,7 +4899,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
 	      if ((unsigned) j == k - 1)
 		*vec_stmt = new_stmt;
-	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+	      if (slp_node)
+		SLP_TREE_VEC_DEFS (slp_node)
+		  .quick_push (gimple_assign_lhs (new_stmt));
+	      else
+		STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
 	      continue;
 	    }
 	  else if (ratype)
@@ -4883,7 +4926,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
       if (j == 0)
 	*vec_stmt = new_stmt;
-      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+      if (slp_node)
+	SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt));
+      else
+	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
     }
 
   for (i = 0; i < nargs; ++i)

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-10-13 12:30 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-13 12:30 [gcc r14-4628] Add support for SLP vectorization of OpenMP SIMD clone calls Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).