From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <rguenth@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1666)
	id A91BC3858D1E; Fri, 13 Oct 2023 12:30:18 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A91BC3858D1E
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1697200218;
	bh=rDHs/xOMntYVmh4p7ZDOq9Lrm3gP/YlD1xXcI/3GWJg=;
	h=From:To:Subject:Date:From;
	b=KDDKq4ZTi4vfKM7AfqtOrO7Ovmf3MmQMYDsMH6xA2oa6i/y8mBJDBgoaPsirbjTAt
	 z77MrghndHmGaeogBzU0o3n60sxPwMdJiJj3N7IhLwIrk+8bs3NhfFCjQrMxMKSJ9Q
	 HAk67ezNFQCJQPUj4zxqiNC6ZOJg1OzudkwvbFPo=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Richard Biener <rguenth@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-4629] OMP SIMD inbranch call vectorization for AVX512 style
 masks
X-Act-Checkin: gcc
X-Git-Author: Richard Biener <rguenther@suse.de>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 63eaccd114393f4692976bb78b30148e6d77a89e
X-Git-Newrev: 3179ad72f67f31824c444ef30ef171ad7495d274
Message-Id: <20231013123018.A91BC3858D1E@sourceware.org>
Date: Fri, 13 Oct 2023 12:30:18 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:3179ad72f67f31824c444ef30ef171ad7495d274

commit r14-4629-g3179ad72f67f31824c444ef30ef171ad7495d274
Author: Richard Biener <rguenther@suse.de>
Date:   Fri Oct 13 12:32:51 2023 +0200

    OMP SIMD inbranch call vectorization for AVX512 style masks
    
    The following teaches vectorizable_simd_clone_call to handle
    integer mode masks.  The tricky bit is guessing the number of
    lanes represented by a single mask argument - this patch computes
    it as simdlen divided by the number of mask arguments, assuming
    the ABI gives every mask argument the same number of lanes.
    
    As with the VOIDmode handling, splitting or merging incoming
    vector masks into more or fewer SIMD call arguments is not yet
    supported.
    
            PR tree-optimization/111795
            * tree-vect-stmts.cc (vectorizable_simd_clone_call): Handle
            integer mode mask arguments.
    
            * gcc.target/i386/vect-simd-clone-avx512-1.c: New testcase.
            * gcc.target/i386/vect-simd-clone-avx512-2.c: Likewise.
            * gcc.target/i386/vect-simd-clone-avx512-3.c: Likewise.

Diff:
---
 .../gcc.target/i386/vect-simd-clone-avx512-1.c     |  43 ++++++
 .../gcc.target/i386/vect-simd-clone-avx512-2.c     |   6 +
 .../gcc.target/i386/vect-simd-clone-avx512-3.c     |   6 +
 gcc/tree-vect-stmts.cc                             | 150 ++++++++++++++++-----
 4 files changed, 175 insertions(+), 30 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-1.c b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-1.c
new file mode 100644
index 00000000000..e350996439e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-1.c
@@ -0,0 +1,43 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -fopenmp-simd -mavx512vl" } */
+
+#include "avx512vl-check.h"
+
+#ifndef SIMDLEN
+#define SIMDLEN 4
+#endif
+
+int x[1024];
+
+#pragma omp declare simd simdlen(SIMDLEN)
+__attribute__((noinline)) int
+foo (int a, int b)
+{
+  return a + b;
+}
+
+void __attribute__((noipa))
+bar (void)
+{
+#pragma omp simd
+  for (int i = 0; i < 1024; i++)
+    if (x[i] < 20)
+      x[i] = foo (x[i], x[i]);
+}
+
+void avx512vl_test ()
+{
+  int i;
+#pragma GCC novector
+  for (i = 0; i < 1024; i++)
+    x[i] = i;
+
+  bar ();
+
+#pragma GCC novector
+  for (i = 0; i < 1024; i++)
+    if ((i < 20 && x[i] != i + i)
+	|| (i >= 20 && x[i] != i))
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-2.c b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-2.c
new file mode 100644
index 00000000000..d9968ae30f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-2.c
@@ -0,0 +1,6 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -fopenmp-simd -mavx512vl" } */
+
+#define SIMDLEN 8
+#include "vect-simd-clone-avx512-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-3.c b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-3.c
new file mode 100644
index 00000000000..c05f6c8ce91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-3.c
@@ -0,0 +1,6 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -fopenmp-simd -mavx512vl" } */
+
+#define SIMDLEN 16
+#include "vect-simd-clone-avx512-1.c"
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 33b557c2a49..b3a56498595 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4385,6 +4385,9 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 		i = -1;
 		break;
 	      case SIMD_CLONE_ARG_TYPE_MASK:
+		if (SCALAR_INT_MODE_P (n->simdclone->mask_mode)
+		    != SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype)))
+		  i = -1;
 		break;
 	      }
 	    if (i == (size_t) -1)
@@ -4410,6 +4413,12 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
   if (bestn == NULL)
     return false;
 
+  unsigned int num_mask_args = 0;
+  if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
+    for (i = 0; i < nargs; i++)
+      if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
+	num_mask_args++;
+
   for (i = 0; i < nargs; i++)
     {
       if ((arginfo[i].dt == vect_constant_def
@@ -4434,30 +4443,50 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 	  return false;
 	}
 
-      if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK
-	  && bestn->simdclone->mask_mode == VOIDmode
-	  && (simd_clone_subparts (bestn->simdclone->args[i].vector_type)
-	      != simd_clone_subparts (arginfo[i].vectype)))
+      if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
 	{
-	  /* FORNOW we only have partial support for vector-type masks that
-	     can't hold all of simdlen. */
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-			     vect_location,
-			     "in-branch vector clones are not yet"
-			     " supported for mismatched vector sizes.\n");
-	  return false;
-	}
-      if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK
-	  && bestn->simdclone->mask_mode != VOIDmode)
-	{
-	  /* FORNOW don't support integer-type masks.  */
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-			     vect_location,
-			     "in-branch vector clones are not yet"
-			     " supported for integer mask modes.\n");
-	  return false;
+	  if (bestn->simdclone->mask_mode == VOIDmode)
+	    {
+	      if (simd_clone_subparts (bestn->simdclone->args[i].vector_type)
+		  != simd_clone_subparts (arginfo[i].vectype))
+		{
+		  /* FORNOW we only have partial support for vector-type masks
+		     that can't hold all of simdlen. */
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+				     vect_location,
+				     "in-branch vector clones are not yet"
+				     " supported for mismatched vector sizes.\n");
+		  return false;
+		}
+	    }
+	  else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
+	    {
+	      if (!SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype))
+		  || maybe_ne (exact_div (bestn->simdclone->simdlen,
+					  num_mask_args),
+			       simd_clone_subparts (arginfo[i].vectype)))
+		{
+		  /* FORNOW we only have partial support for integer-type masks
+		     that represent the same number of lanes as the
+		     vectorized mask inputs. */
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+				     vect_location,
+				     "in-branch vector clones are not yet "
+				     "supported for mismatched vector sizes.\n");
+		  return false;
+		}
+	    }
+	  else
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+				 vect_location,
+				 "in-branch vector clones not supported"
+				 " on this target.\n");
+	      return false;
+	    }
 	}
     }
 
@@ -4674,14 +4703,9 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 		}
 	      break;
 	    case SIMD_CLONE_ARG_TYPE_MASK:
-	      atype = bestn->simdclone->args[i].vector_type;
-	      if (bestn->simdclone->mask_mode != VOIDmode)
-		{
-		  /* FORNOW: this is disabled above.  */
-		  gcc_unreachable ();
-		}
-	      else
+	      if (bestn->simdclone->mask_mode == VOIDmode)
 		{
+		  atype = bestn->simdclone->args[i].vector_type;
 		  tree elt_type = TREE_TYPE (atype);
 		  tree one = fold_convert (elt_type, integer_one_node);
 		  tree zero = fold_convert (elt_type, integer_zero_node);
@@ -4732,6 +4756,72 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 			}
 		    }
 		}
+	      else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
+		{
+		  atype = bestn->simdclone->args[i].vector_type;
+		  /* Guess the number of lanes represented by atype.  */
+		  unsigned HOST_WIDE_INT atype_subparts
+		    = exact_div (bestn->simdclone->simdlen,
+				 num_mask_args).to_constant ();
+		  o = vector_unroll_factor (nunits, atype_subparts);
+		  for (m = j * o; m < (j + 1) * o; m++)
+		    {
+		      if (m == 0)
+			{
+			  if (!slp_node)
+			    vect_get_vec_defs_for_operand (vinfo, stmt_info,
+							   o * ncopies,
+							   op,
+							   &vec_oprnds[i]);
+			  vec_oprnds_i[i] = 0;
+			}
+		      if (atype_subparts
+			  < simd_clone_subparts (arginfo[i].vectype))
+			{
+			  /* The mask argument has fewer elements than the
+			     input vector.  */
+			  /* FORNOW */
+			  gcc_unreachable ();
+			}
+		      else if (atype_subparts
+			       == simd_clone_subparts (arginfo[i].vectype))
+			{
+			  /* The vector mask argument matches the input
+			     in the number of lanes, but not necessarily
+			     in the mode.  */
+			  vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
+			  tree st = lang_hooks.types.type_for_mode
+				      (TYPE_MODE (TREE_TYPE (vec_oprnd0)), 1);
+			  vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, st,
+					       vec_oprnd0);
+			  gassign *new_stmt
+			    = gimple_build_assign (make_ssa_name (st),
+						   vec_oprnd0);
+			  vect_finish_stmt_generation (vinfo, stmt_info,
+						       new_stmt, gsi);
+			  if (!types_compatible_p (atype, st))
+			    {
+			      new_stmt
+				= gimple_build_assign (make_ssa_name (atype),
+						       NOP_EXPR,
+						       gimple_assign_lhs
+							 (new_stmt));
+			      vect_finish_stmt_generation (vinfo, stmt_info,
+							   new_stmt, gsi);
+			    }
+			  vargs.safe_push (gimple_assign_lhs (new_stmt));
+			}
+		      else
+			{
+			  /* The mask argument has more elements than the
+			     input vector.  */
+			  /* FORNOW */
+			  gcc_unreachable ();
+			}
+		    }
+		}
+	      else
+		gcc_unreachable ();
 	      break;
 	    case SIMD_CLONE_ARG_TYPE_UNIFORM:
 	      vargs.safe_push (op);