OpenMP: Fix SIMT for complex/float reduction with && and ||

2021-05-07  Tobias Burnus
	    Tom de Vries

gcc/ChangeLog:

	* omp-low.c (lower_rec_simd_input_clauses): Set max_vf = 1 if
	a truth_value_p reduction variable is nonintegral.

libgomp/ChangeLog:

	* testsuite/libgomp.c-c++-common/reduction-5.c: New test, testing
	complex/floating-point || + && reduction with 'omp target'.
	* testsuite/libgomp.c-c++-common/reduction-6.c: Likewise.

 gcc/omp-low.c                                    |  28 ++-
 .../testsuite/libgomp.c-c++-common/reduction-5.c | 193 ++++++++++++++++++++
 .../testsuite/libgomp.c-c++-common/reduction-6.c | 196 +++++++++++++++++++++
 3 files changed, 410 insertions(+), 7 deletions(-)

diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 26ceaf74b2d..2325cfcfc34 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -4389,14 +4389,28 @@ lower_rec_simd_input_clauses (tree new_var, omp_context *ctx,
         {
           for (tree c = gimple_omp_for_clauses (ctx->stmt); c;
               c = OMP_CLAUSE_CHAIN (c))
-            if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
-                && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
-              {
-                /* UDR reductions are not supported yet for SIMT, disable
-                   SIMT.  */
-                sctx->max_vf = 1;
-                break;
+            {
+              if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION)
+                continue;
+
+              if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
+                {
+                  /* UDR reductions are not supported yet for SIMT, disable
+                     SIMT.  */
+                  sctx->max_vf = 1;
+                  break;
+                }
+
+              if (truth_value_p (OMP_CLAUSE_REDUCTION_CODE (c))
+                  && !INTEGRAL_TYPE_P (TREE_TYPE (new_var)))
+                {
+                  /* Doing boolean operations on non-integral types is
+                     for conformance only, it's not worth supporting this
+                     for SIMT.  */
+                  sctx->max_vf = 1;
+                  break;
               }
+            }
         }
       if (maybe_gt (sctx->max_vf, 1U))
         {
diff --git a/libgomp/testsuite/libgomp.c-c++-common/reduction-5.c b/libgomp/testsuite/libgomp.c-c++-common/reduction-5.c
new file mode 100644
index 00000000000..21540512e23
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/reduction-5.c
@@ -0,0 +1,193 @@
+/* { dg-additional-options "-foffload=-latomic" { target { offload_target_nvptx } } } */
+/* C / C++'s logical AND and OR operators take any scalar argument
+   which compares (un)equal to 0 - the result 1 or 0 and of type int.
+
+   In this testcase, the int result is again converted to a floating-point
+   or complex type.
+
+   While having a floating-point/complex array element with || and && can make
+   sense, having a non-integer/non-bool reduction variable is odd but valid.
+
+   Test: FP reduction variable + FP array - as reduction-1.c but with target  */
+
+#define N 1024
+_Complex float rcf[N];
+_Complex double rcd[N];
+float rf[N];
+double rd[N];
+
+int
+reduction_or ()
+{
+  float orf = 0;
+  double ord = 0;
+  _Complex float orfc = 0;
+  _Complex double ordc = 0;
+
+  #pragma omp target parallel reduction(||: orf) map(orf)
+  for (int i=0; i < N; ++i)
+    orf = orf || rf[i];
+
+  #pragma omp target parallel for reduction(||: ord) map(ord)
+  for (int i=0; i < N; ++i)
+    ord = ord || rcd[i];
+
+  #pragma omp target parallel for simd reduction(||: orfc) map(orfc)
+  for (int i=0; i < N; ++i)
+    orfc = orfc || rcf[i];
+
+  #pragma omp target parallel loop reduction(||: ordc) map(ordc)
+  for (int i=0; i < N; ++i)
+    ordc = ordc || rcd[i];
+
+  return orf + ord + __real__ orfc + __real__ ordc;
+}
+
+int
+reduction_or_teams ()
+{
+  float orf = 0;
+  double ord = 0;
+  _Complex float orfc = 0;
+  _Complex double ordc = 0;
+
+  #pragma omp target teams distribute parallel for reduction(||: orf) map(orf)
+  for (int i=0; i < N; ++i)
+    orf = orf || rf[i];
+
+  #pragma omp target teams distribute parallel for simd reduction(||: ord) map(ord)
+  for (int i=0; i < N; ++i)
+    ord = ord || rcd[i];
+
+  #pragma omp target teams distribute parallel for reduction(||: orfc) map(orfc)
+  for (int i=0; i < N; ++i)
+    orfc = orfc || rcf[i];
+
+  #pragma omp target teams distribute parallel for simd reduction(||: ordc) map(ordc)
+  for (int i=0; i < N; ++i)
+    ordc = ordc || rcd[i];
+
+  return orf + ord + __real__ orfc + __real__ ordc;
+}
+
+int
+reduction_and ()
+{
+  float andf = 1;
+  double andd = 1;
+  _Complex float andfc = 1;
+  _Complex double anddc = 1;
+
+  #pragma omp target parallel reduction(&&: andf) map(andf)
+  for (int i=0; i < N; ++i)
+    andf = andf && rf[i];
+
+  #pragma omp target parallel for reduction(&&: andd) map(andd)
+  for (int i=0; i < N; ++i)
+    andd = andd && rcd[i];
+
+  #pragma omp target parallel for simd reduction(&&: andfc) map(andfc)
+  for (int i=0; i < N; ++i)
+    andfc = andfc && rcf[i];
+
+  #pragma omp target parallel loop reduction(&&: anddc) map(anddc)
+  for (int i=0; i < N; ++i)
+    anddc = anddc && rcd[i];
+
+  return andf + andd + __real__ andfc + __real__ anddc;
+}
+
+int
+reduction_and_teams ()
+{
+  float andf = 1;
+  double andd = 1;
+  _Complex float andfc = 1;
+  _Complex double anddc = 1;
+
+  #pragma omp target teams distribute parallel for reduction(&&: andf) map(andf)
+  for (int i=0; i < N; ++i)
+    andf = andf && rf[i];
+
+  #pragma omp target teams distribute parallel for simd reduction(&&: andd) map(andd)
+  for (int i=0; i < N; ++i)
+    andd = andd && rcd[i];
+
+  #pragma omp target teams distribute parallel for reduction(&&: andfc) map(andfc)
+  for (int i=0; i < N; ++i)
+    andfc = andfc && rcf[i];
+
+  #pragma omp target teams distribute parallel for simd reduction(&&: anddc) map(anddc)
+  for (int i=0; i < N; ++i)
+    anddc = anddc && rcd[i];
+
+  return andf + andd + __real__ andfc + __real__ anddc;
+}
+
+int
+main ()
+{
+  for (int i = 0; i < N; ++i)
+    {
+      rf[i] = 0;
+      rd[i] = 0;
+      rcf[i] = 0;
+      rcd[i] = 0;
+    }
+
+  if (reduction_or () != 0)
+    __builtin_abort ();
+  if (reduction_or_teams () != 0)
+    __builtin_abort ();
+  if (reduction_and () != 0)
+    __builtin_abort ();
+  if (reduction_and_teams () != 0)
+    __builtin_abort ();
+
+  rf[10] = 1.0;
+  rd[15] = 1.0;
+  rcf[10] = 1.0;
+  rcd[15] = 1.0i;
+
+  if (reduction_or () != 4)
+    __builtin_abort ();
+  if (reduction_or_teams () != 4)
+    __builtin_abort ();
+  if (reduction_and () != 0)
+    __builtin_abort ();
+  if (reduction_and_teams () != 0)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    {
+      rf[i] = 1;
+      rd[i] = 1;
+      rcf[i] = 1;
+      rcd[i] = 1;
+    }
+
+  if (reduction_or () != 4)
+    __builtin_abort ();
+  if (reduction_or_teams () != 4)
+    __builtin_abort ();
+  if (reduction_and () != 4)
+    __builtin_abort ();
+  if (reduction_and_teams () != 4)
+    __builtin_abort ();
+
+  rf[10] = 0.0;
+  rd[15] = 0.0;
+  rcf[10] = 0.0;
+  rcd[15] = 0.0;
+
+  if (reduction_or () != 4)
+    __builtin_abort ();
+  if (reduction_or_teams () != 4)
+    __builtin_abort ();
+  if (reduction_and () != 0)
+    __builtin_abort ();
+  if (reduction_and_teams () != 0)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/reduction-6.c b/libgomp/testsuite/libgomp.c-c++-common/reduction-6.c
new file mode 100644
index 00000000000..27d9ef6b635
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/reduction-6.c
@@ -0,0 +1,196 @@
+/* { dg-additional-options "-foffload=-latomic" { target { offload_target_nvptx } } } */
+/* C / C++'s logical AND and OR operators take any scalar argument
+   which compares (un)equal to 0 - the result 1 or 0 and of type int.
+
+   In this testcase, the int result is again converted to an integer complex
+   type.
+
+   While having a floating-point/complex array element with || and && can make
+   sense, having a complex reduction variable is odd but valid.
+
+   Test: int complex reduction variable + int complex array,
+   as reduction-4.c but with target.  */
+
+#define N 1024
+_Complex char rcc[N];
+_Complex short rcs[N];
+_Complex int rci[N];
+_Complex long long rcl[N];
+
+int
+reduction_or ()
+{
+  _Complex char orc = 0;
+  _Complex short ors = 0;
+  _Complex int ori = 0;
+  _Complex long orl = 0;
+
+  #pragma omp target parallel reduction(||: orc) map(orc)
+  for (int i=0; i < N; ++i)
+    orc = orc || rcl[i];
+
+  #pragma omp target parallel for reduction(||: ors) map(ors)
+  for (int i=0; i < N; ++i)
+    ors = ors || rci[i];
+
+  #pragma omp target parallel for simd reduction(||: ori) map(ori)
+  for (int i=0; i < N; ++i)
+    ori = ori || rcs[i];
+
+  #pragma omp target parallel loop reduction(||: orl) map(orl)
+  for (int i=0; i < N; ++i)
+    orl = orl || rcc[i];
+
+  return __real__ (orc + ors + ori + orl) + __imag__ (orc + ors + ori + orl);
+}
+
+int
+reduction_or_teams ()
+{
+  _Complex char orc = 0;
+  _Complex short ors = 0;
+  _Complex int ori = 0;
+  _Complex long orl = 0;
+
+  #pragma omp target teams distribute parallel for reduction(||: orc) map(orc)
+  for (int i=0; i < N; ++i)
+    orc = orc || rcc[i];
+
+  #pragma omp target teams distribute parallel for simd reduction(||: ors) map(ors)
+  for (int i=0; i < N; ++i)
+    ors = ors || rcs[i];
+
+  #pragma omp target teams distribute parallel for reduction(||: ori) map(ori)
+  for (int i=0; i < N; ++i)
+    ori = ori || rci[i];
+
+  #pragma omp target teams distribute parallel for simd reduction(||: orl) map(orl)
+  for (int i=0; i < N; ++i)
+    orl = orl || rcl[i];
+
+  return __real__ (orc + ors + ori + orl) + __imag__ (orc + ors + ori + orl);
+}
+
+int
+reduction_and ()
+{
+  _Complex char andc = 1;
+  _Complex short ands = 1;
+  _Complex int andi = 1;
+  _Complex long andl = 1;
+
+  #pragma omp target parallel reduction(&&: andc) map(andc)
+  for (int i=0; i < N; ++i)
+    andc = andc && rcc[i];
+
+  #pragma omp target parallel for reduction(&&: ands) map(ands)
+  for (int i=0; i < N; ++i)
+    ands = ands && rcs[i];
+
+  #pragma omp target parallel for simd reduction(&&: andi) map(andi)
+  for (int i=0; i < N; ++i)
+    andi = andi && rci[i];
+
+  #pragma omp target parallel loop reduction(&&: andl) map(andl)
+  for (int i=0; i < N; ++i)
+    andl = andl && rcl[i];
+
+  return __real__ (andc + ands + andi + andl)
+         + __imag__ (andc + ands + andi + andl);
+}
+
+int
+reduction_and_teams ()
+{
+  _Complex char andc = 1;
+  _Complex short ands = 1;
+  _Complex int andi = 1;
+  _Complex long andl = 1;
+
+  #pragma omp target teams distribute parallel for reduction(&&: andc) map(andc)
+  for (int i=0; i < N; ++i)
+    andc = andc && rcl[i];
+
+  #pragma omp target teams distribute parallel for simd reduction(&&: ands) map(ands)
+  for (int i=0; i < N; ++i)
+    ands = ands && rci[i];
+
+  #pragma omp target teams distribute parallel for reduction(&&: andi) map(andi)
+  for (int i=0; i < N; ++i)
+    andi = andi && rcs[i];
+
+  #pragma omp target teams distribute parallel for simd reduction(&&: andl) map(andl)
+  for (int i=0; i < N; ++i)
+    andl = andl && rcc[i];
+
+  return __real__ (andc + ands + andi + andl)
+         + __imag__ (andc + ands + andi + andl);
+}
+
+int
+main ()
+{
+  for (int i = 0; i < N; ++i)
+    {
+      rcc[i] = 0;
+      rcs[i] = 0;
+      rci[i] = 0;
+      rcl[i] = 0;
+    }
+
+  if (reduction_or () != 0)
+    __builtin_abort ();
+  if (reduction_or_teams () != 0)
+    __builtin_abort ();
+  if (reduction_and () != 0)
+    __builtin_abort ();
+  if (reduction_and_teams () != 0)
+    __builtin_abort ();
+
+  rcc[10] = 1.0;
+  rcs[15] = 1.0i;
+  rci[10] = 1.0;
+  rcl[15] = 1.0i;
+
+  if (reduction_or () != 4)
+    __builtin_abort ();
+  if (reduction_or_teams () != 4)
+    __builtin_abort ();
+  if (reduction_and () != 0)
+    __builtin_abort ();
+  if (reduction_and_teams () != 0)
+    __builtin_abort ();
+
+  for (int i = 0; i < N; ++i)
+    {
+      rcc[i] = 1;
+      rcs[i] = 1i;
+      rci[i] = 1;
+      rcl[i] = 1 + 1i;
+    }
+
+  if (reduction_or () != 4)
+    __builtin_abort ();
+  if (reduction_or_teams () != 4)
+    __builtin_abort ();
+  if (reduction_and () != 4)
+    __builtin_abort ();
+  if (reduction_and_teams () != 4)
+    __builtin_abort ();
+
+  rcc[10] = 0.0;
+  rcs[15] = 0.0;
+  rci[10] = 0.0;
+  rcl[15] = 0.0;
+
+  if (reduction_or () != 4)
+    __builtin_abort ();
+  if (reduction_or_teams () != 4)
+    __builtin_abort ();
+  if (reduction_and () != 0)
+    __builtin_abort ();
+  if (reduction_and_teams () != 0)
+    __builtin_abort ();
+
+  return 0;
+}
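
For reference, a minimal host-only C sketch (not part of the patch) of the language rule the test comments describe: '&&' and '||' accept any scalar operand, including _Complex values; the operand is compared against 0, the result is an int 0 or 1, and the assignment converts that int back to the floating-point or complex reduction variable's type. It uses GCC's imaginary-constant (1.0i) and __real__ extensions, as the tests do.

/* Standalone illustration of && / || on complex and floating-point operands.  */
#include <stdio.h>

int
main ()
{
  _Complex double c = 1.0i;  /* nonzero: real part 0, imaginary part 1 */
  double d = 0.0;            /* zero */

  double or_result = d || c;           /* 0 || nonzero -> int 1, converted to 1.0  */
  _Complex float and_result = d && c;  /* 0 && nonzero -> int 0, converted to 0.0  */

  printf ("%g %g\n", or_result, (double) __real__ and_result);  /* prints "1 0" */
  return 0;
}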