* Runtime checking of OpenACC parallelism dimensions clauses
@ 2017-05-11 12:27 Thomas Schwinge
2017-05-14 10:29 ` Thomas Schwinge
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Thomas Schwinge @ 2017-05-11 12:27 UTC (permalink / raw)
To: gcc-patches, Jakub Jelinek
Hi!
OK for trunk?
commit 0ba48b4faf85420fbe12971afdd6e0afe70778bb
Author: Thomas Schwinge <thomas@codesourcery.com>
Date: Fri May 5 16:41:59 2017 +0200
Runtime checking of OpenACC parallelism dimensions clauses
libgomp/
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
* testsuite/lib/libgomp.exp
(check_effective_target_openacc_nvidia_accel_configured): New
proc.
* testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
(check_effective_target_c++): New procs.
* testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
(check_effective_target_c++): Likewise.
---
libgomp/testsuite/lib/libgomp.exp | 12 +
libgomp/testsuite/libgomp.oacc-c++/c++.exp | 7 +
.../libgomp.oacc-c-c++-common/parallel-dims.c | 523 ++++++++++++++++++++-
libgomp/testsuite/libgomp.oacc-c/c.exp | 7 +
4 files changed, 537 insertions(+), 12 deletions(-)
diff --git libgomp/testsuite/lib/libgomp.exp libgomp/testsuite/lib/libgomp.exp
index 5e47872..62ee2e3 100644
--- libgomp/testsuite/lib/libgomp.exp
+++ libgomp/testsuite/lib/libgomp.exp
@@ -358,6 +358,18 @@ proc check_effective_target_offload_device_shared_as { } {
} ]
}
+# Return 1 if configured for nvptx offloading.
+
+proc check_effective_target_openacc_nvidia_accel_configured { } {
+ global offload_targets
+ if { ![string match "*,nvptx,*" ",$offload_targets,"] } {
+ return 0
+ }
+ # PR libgomp/65099: Currently, we only support offloading in 64-bit
+ # configurations.
+ return [is-effective-target lp64]
+}
+
# Return 1 if at least one nvidia board is present.
proc check_effective_target_openacc_nvidia_accel_present { } {
diff --git libgomp/testsuite/libgomp.oacc-c++/c++.exp libgomp/testsuite/libgomp.oacc-c++/c++.exp
index 608b298..9beadd6 100644
--- libgomp/testsuite/libgomp.oacc-c++/c++.exp
+++ libgomp/testsuite/libgomp.oacc-c++/c++.exp
@@ -4,6 +4,13 @@ load_lib libgomp-dg.exp
load_gcc_lib gcc-dg.exp
load_gcc_lib torture-options.exp
+proc check_effective_target_c { } {
+ return 0
+}
+proc check_effective_target_c++ { } {
+ return 1
+}
+
global shlib_ext
set shlib_ext [get_shlib_extension]
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index f5766a4..d8af546 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -1,25 +1,524 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
+ vector_length. */
+
+#include <limits.h>
+#include <openacc.h>
+
+/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
+ not behaving as expected for -O0. */
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
+{
+ if (acc_on_device ((int) acc_device_host))
+ return 0;
+ else if (acc_on_device ((int) acc_device_nvidia))
+ {
+ unsigned int r;
+ asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
+ return r;
+ }
+ else
+ __builtin_abort ();
+}
+
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
+{
+ if (acc_on_device ((int) acc_device_host))
+ return 0;
+ else if (acc_on_device ((int) acc_device_nvidia))
+ {
+ unsigned int r;
+ asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
+ return r;
+ }
+ else
+ __builtin_abort ();
+}
+
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
+{
+ if (acc_on_device ((int) acc_device_host))
+ return 0;
+ else if (acc_on_device ((int) acc_device_nvidia))
+ {
+ unsigned int r;
+ asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));
+ return r;
+ }
+ else
+ __builtin_abort ();
+}
-/* Worker and vector size checks. Picked an outrageously large
- value. */
int main ()
{
- int dummy[10];
+ acc_init (acc_device_default);
-#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_workers" } */
+ /* Non-positive value. */
+
+ /* GR, WS, VS. */
+ {
+#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
+ int gangs_actual = GANGS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
+ num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+ {
+ /* <https://gcc.gnu.org/PR80547>. */
+#if 0
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+#else
+ int gangs = acc_gang ();
+ gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
+ gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
+ int workers = acc_worker ();
+ workers_min = (workers_min < workers) ? workers_min : workers;
+ workers_max = (workers_max > workers) ? workers_max : workers;
+ int vectors = acc_vector ();
+ vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
+ vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
+#endif
+ }
+ }
+ if (gangs_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef GANGS
+ }
+
+ /* GP, WS, VS. */
+ {
+#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
+ int gangs_actual = GANGS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+ num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef GANGS
+ }
+
+ /* GR, WP, VS. */
+ {
+#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
+ int workers_actual = WORKERS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) \
+ num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with num_workers (1). */
+ workers_actual = 1;
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (workers_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef WORKERS
+ }
+
+ /* GR, WS, VP. */
+ {
+#define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
+ int vectors_actual = VECTORS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \
+ vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with vector_length (1), just the GCC nvptx
+ back end enforces vector_length (32). */
+ if (acc_on_device (acc_device_nvidia))
+ vectors_actual = 32;
+ else
+ vectors_actual = 1;
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (acc_get_device_type () == acc_device_nvidia)
+ {
+ if (vectors_actual != 32)
+ __builtin_abort ();
+ }
+ else
+ if (vectors_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
+#undef VECTORS
+ }
+
+
+ /* High value. */
+
+ /* GR, WS, VS. */
+ {
+ /* There is no actual limit for the number of gangs, so we try with a
+ rather high value. */
+ int gangs = 12345;
+ int gangs_actual = gangs;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
+ num_gangs (gangs)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+ }
+ /* As we're executing GR not GP, don't multiply with a "gangs_actual"
+ factor. */
+ for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+ }
+
+ /* GP, WS, VS. */
+ {
+ /* There is no actual limit for the number of gangs, so we try with a
+ rather high value. */
+ int gangs = 12345;
+ int gangs_actual = gangs;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+ num_gangs (gangs)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+ }
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+ }
+
+ /* GR, WP, VS. */
+ {
+ /* We try with an outrageously large value. */
+#define WORKERS 2 << 20
+ int workers_actual = WORKERS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
+ num_workers (WORKERS)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_workers (1). */
+ workers_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces num_workers (32). */
+ workers_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (workers_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef WORKERS
+ }
+
+ /* GR, WP, VS. */
+ {
+ /* We try with an outrageously large value. */
+ int workers = 2 << 20;
+ /* For nvptx offloading, this one will not result in "using num_workers
+ (32), ignoring runtime setting", and will in fact try to launch with
+ "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
+ error: invalid argument". So, limit ourselves here. */
+ if (acc_get_device_type () == acc_device_nvidia)
+ workers = 32;
+ int workers_actual = workers;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) \
+ num_workers (workers)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_workers (1). */
+ workers_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* We're actually executing with num_workers (32). */
+ /* workers_actual = 32; */
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (workers_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+ }
+
+ /* GR, WS, VP. */
{
-#pragma acc loop worker
- for (int i = 0; i < 10; i++)
- dummy[i] = i;
+ /* We try with an outrageously large value. */
+#define VECTORS 2 << 20
+ int vectors_actual = VECTORS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
+ vector_length (VECTORS)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with vector_length (1). */
+ vectors_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces vector_length (32). */
+ vectors_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (vectors_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
+#undef VECTORS
}
-#pragma acc parallel vector_length (2<<20) /* { dg-error "using vector_length" } */
+ /* GR, WS, VP. */
{
-#pragma acc loop vector
- for (int i = 0; i < 10; i++)
- dummy[i] = i;
+ /* We try with an outrageously large value. */
+ int vectors = 2 << 20;
+ int vectors_actual = vectors;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \
+ vector_length (vectors)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with vector_length (1). */
+ vectors_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces vector_length (32). */
+ vectors_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (vectors_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
}
+
+ /* Composition of GP, WP, VP. */
+ {
+ int gangs = 12345;
+ /* With nvptx offloading, multi-level reductions apparently are very slow
+ in the following case. So, limit ourselves here. */
+ if (acc_get_device_type () == acc_device_nvidia)
+ gangs = 3;
+ int gangs_actual = gangs;
+#define WORKERS 3
+ int workers_actual = WORKERS;
+#define VECTORS 11
+ int vectors_actual = VECTORS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \
+ num_gangs (gangs) \
+ num_workers (WORKERS) \
+ vector_length (VECTORS)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_gangs (1), num_workers (1),
+ vector_length (1). */
+ gangs_actual = 1;
+ workers_actual = 1;
+ vectors_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces vector_length (32). */
+ vectors_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
+#undef VECTORS
+#undef WORKERS
+ }
+
+
+ /* We can't test parallelized OpenACC kernels constructs in this way: use of
+ the acc_gang, acc_worker, acc_vector functions will make the construct
+ unparallelizable. */
+
+
+ /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
+ kernels. */
+ {
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc kernels
+ {
+ /* This is to make the OpenACC kernels construct unparallelizable. */
+ asm volatile ("" : : : "memory");
+
+#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100; i > -100; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_min != 0 || gangs_max != 1 - 1
+ || workers_min != 0 || workers_max != 1 - 1
+ || vectors_min != 0 || vectors_max != 1 - 1)
+ __builtin_abort ();
+ }
+
+
return 0;
}
diff --git libgomp/testsuite/libgomp.oacc-c/c.exp libgomp/testsuite/libgomp.oacc-c/c.exp
index b509a10..4475bf5 100644
--- libgomp/testsuite/libgomp.oacc-c/c.exp
+++ libgomp/testsuite/libgomp.oacc-c/c.exp
@@ -15,6 +15,13 @@ load_lib libgomp-dg.exp
load_gcc_lib gcc-dg.exp
load_gcc_lib torture-options.exp
+proc check_effective_target_c { } {
+ return 1
+}
+proc check_effective_target_c++ { } {
+ return 0
+}
+
# Initialize dg.
dg-init
torture-init
Grüße
Thomas
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Runtime checking of OpenACC parallelism dimensions clauses
2017-05-11 12:27 Runtime checking of OpenACC parallelism dimensions clauses Thomas Schwinge
@ 2017-05-14 10:29 ` Thomas Schwinge
2017-05-19 11:03 ` Thomas Schwinge
2017-05-23 8:41 ` Jakub Jelinek
2 siblings, 0 replies; 5+ messages in thread
From: Thomas Schwinge @ 2017-05-14 10:29 UTC (permalink / raw)
To: gcc-patches; +Cc: Jakub Jelinek
Hi!
On Thu, 11 May 2017 14:24:05 +0200, I wrote:
> OK for trunk?
> Runtime checking of OpenACC parallelism dimensions clauses
For now, committed to gomp-4_0-branch in r248030:
commit 59e5204e0ec16c0f14ec68148f856fd307ef8d51
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Sun May 14 10:25:46 2017 +0000
Runtime checking of OpenACC parallelism dimensions clauses
libgomp/
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
* testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
(check_effective_target_c++): New procs.
* testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
(check_effective_target_c++): Likewise.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@248030 138bc75d-0d04-0410-961f-82ee72b054a4
---
libgomp/ChangeLog.gomp | 8 +
libgomp/testsuite/libgomp.oacc-c++/c++.exp | 7 +
.../libgomp.oacc-c-c++-common/parallel-dims.c | 526 ++++++++++++++++++++-
libgomp/testsuite/libgomp.oacc-c/c.exp | 7 +
4 files changed, 536 insertions(+), 12 deletions(-)
diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp
index def0feb..a1627a8 100644
--- libgomp/ChangeLog.gomp
+++ libgomp/ChangeLog.gomp
@@ -1,3 +1,11 @@
+2017-05-14 Thomas Schwinge <thomas@codesourcery.com>
+
+ * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
+ * testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
+ (check_effective_target_c++): New procs.
+ * testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
+ (check_effective_target_c++): Likewise.
+
2017-05-12 Cesar Philippidis <cesar@codesourcery.com>
* testsuite/libgomp.oacc-c-c++-common/par-reduction-3.c: New test.
diff --git libgomp/testsuite/libgomp.oacc-c++/c++.exp libgomp/testsuite/libgomp.oacc-c++/c++.exp
index ba1a28e..695b96d 100644
--- libgomp/testsuite/libgomp.oacc-c++/c++.exp
+++ libgomp/testsuite/libgomp.oacc-c++/c++.exp
@@ -4,6 +4,13 @@ load_lib libgomp-dg.exp
load_gcc_lib gcc-dg.exp
load_gcc_lib torture-options.exp
+proc check_effective_target_c { } {
+ return 0
+}
+proc check_effective_target_c++ { } {
+ return 1
+}
+
global shlib_ext
set shlib_ext [get_shlib_extension]
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index f5766a4..3458757 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -1,25 +1,527 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
+ vector_length. */
+
+/* { dg-additional-options "-foffload-force" } */
+
+#include <limits.h>
+#include <openacc.h>
+
+/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
+ not behaving as expected for -O0. */
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
+{
+ if (acc_on_device ((int) acc_device_host))
+ return 0;
+ else if (acc_on_device ((int) acc_device_nvidia))
+ {
+ unsigned int r;
+ asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
+ return r;
+ }
+ else
+ __builtin_abort ();
+}
+
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
+{
+ if (acc_on_device ((int) acc_device_host))
+ return 0;
+ else if (acc_on_device ((int) acc_device_nvidia))
+ {
+ unsigned int r;
+ asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
+ return r;
+ }
+ else
+ __builtin_abort ();
+}
+
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
+{
+ if (acc_on_device ((int) acc_device_host))
+ return 0;
+ else if (acc_on_device ((int) acc_device_nvidia))
+ {
+ unsigned int r;
+ asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));
+ return r;
+ }
+ else
+ __builtin_abort ();
+}
-/* Worker and vector size checks. Picked an outrageously large
- value. */
int main ()
{
- int dummy[10];
+ acc_init (acc_device_default);
-#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_workers" } */
+ /* Non-positive value. */
+
+ /* GR, WS, VS. */
+ {
+#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
+ int gangs_actual = GANGS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
+ num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+ {
+ /* <https://gcc.gnu.org/PR80547>. */
+#if 0
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+#else
+ int gangs = acc_gang ();
+ gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
+ gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
+ int workers = acc_worker ();
+ workers_min = (workers_min < workers) ? workers_min : workers;
+ workers_max = (workers_max > workers) ? workers_max : workers;
+ int vectors = acc_vector ();
+ vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
+ vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
+#endif
+ }
+ }
+ if (gangs_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef GANGS
+ }
+
+ /* GP, WS, VS. */
+ {
+#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
+ int gangs_actual = GANGS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) /* { dg-warning "region contains gang partitoned code but is not gang partitioned" } */ \
+ num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef GANGS
+ }
+
+ /* GR, WP, VS. */
+ {
+#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
+ int workers_actual = WORKERS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) /* { dg-warning "region contains worker partitoned code but is not worker partitioned" } */ \
+ num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with num_workers (1). */
+ workers_actual = 1;
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (workers_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef WORKERS
+ }
+
+ /* GR, WS, VP. */
+ {
+#define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
+ int vectors_actual = VECTORS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "region contains vector partitoned code but is not vector partitioned" } */ \
+ /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } 170 } */ \
+ vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with vector_length (1), just the GCC nvptx
+ back end enforces vector_length (32). */
+ if (acc_on_device (acc_device_nvidia))
+ vectors_actual = 32;
+ else
+ vectors_actual = 1;
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (acc_get_device_type () == acc_device_nvidia)
+ {
+ if (vectors_actual != 32)
+ __builtin_abort ();
+ }
+ else
+ if (vectors_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
+#undef VECTORS
+ }
+
+
+ /* High value. */
+
+ /* GR, WS, VS. */
+ {
+ /* There is no actual limit for the number of gangs, so we try with a
+ rather high value. */
+ int gangs = 12345;
+ int gangs_actual = gangs;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) /* { dg-warning "region is gang partitioned but does not contain gang partitioned code" } */ \
+ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
+ num_gangs (gangs)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+ }
+ /* As we're executing GR not GP, don't multiply with a "gangs_actual"
+ factor. */
+ for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+ }
+
+ /* GP, WS, VS. */
+ {
+ /* There is no actual limit for the number of gangs, so we try with a
+ rather high value. */
+ int gangs = 12345;
+ int gangs_actual = gangs;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+ num_gangs (gangs)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+ }
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+ }
+
+ /* GR, WP, VS. */
+ {
+ /* We try with an outrageously large value. */
+#define WORKERS 2 << 20
+ int workers_actual = WORKERS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \
+ num_workers (WORKERS)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_workers (1). */
+ workers_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces num_workers (32). */
+ workers_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (workers_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef WORKERS
+ }
+
+ /* GR, WP, VS. */
+ {
+ /* We try with an outrageously large value. */
+ int workers = 2 << 20;
+ /* For nvptx offloading, this one will not result in "using num_workers
+ (32), ignoring runtime setting", and will in fact try to launch with
+ "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
+ error: invalid argument". So, limit ourselves here. */
+ if (acc_get_device_type () == acc_device_nvidia)
+ workers = 32;
+ int workers_actual = workers;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) \
+ num_workers (workers)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_workers (1). */
+ workers_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* We're actually executing with num_workers (32). */
+ /* workers_actual = 32; */
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (workers_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+ }
+
+ /* GR, WS, VP. */
{
-#pragma acc loop worker
- for (int i = 0; i < 10; i++)
- dummy[i] = i;
+ /* We try with an outrageously large value. */
+#define VECTORS 2 << 20
+ int vectors_actual = VECTORS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \
+ vector_length (VECTORS)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with vector_length (1). */
+ vectors_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces vector_length (32). */
+ vectors_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (vectors_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
+#undef VECTORS
}
-#pragma acc parallel vector_length (2<<20) /* { dg-error "using vector_length" } */
+ /* GR, WS, VP. */
{
-#pragma acc loop vector
- for (int i = 0; i < 10; i++)
- dummy[i] = i;
+ /* We try with an outrageously large value. */
+ int vectors = 2 << 20;
+ int vectors_actual = vectors;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_selected } } */ \
+ vector_length (vectors)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with vector_length (1). */
+ vectors_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces vector_length (32). */
+ vectors_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (vectors_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
}
+
+ /* Composition of GP, WP, VP. */
+ {
+ int gangs = 12345;
+ /* With nvptx offloading, multi-level reductions apparently are very slow
+ in the following case. So, limit ourselves here. */
+ if (acc_get_device_type () == acc_device_nvidia)
+ gangs = 3;
+ int gangs_actual = gangs;
+#define WORKERS 3
+ int workers_actual = WORKERS;
+#define VECTORS 11
+ int vectors_actual = VECTORS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_selected } } */ \
+ num_gangs (gangs) \
+ num_workers (WORKERS) \
+ vector_length (VECTORS)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_gangs (1), num_workers (1),
+ vector_length (1). */
+ gangs_actual = 1;
+ workers_actual = 1;
+ vectors_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces vector_length (32). */
+ vectors_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
+#undef VECTORS
+#undef WORKERS
+ }
+
+
+ /* We can't test parallelized OpenACC kernels constructs in this way: use of
+ the acc_gang, acc_worker, acc_vector functions will make the construct
+ unparallelizable. */
+
+
+ /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
+ kernels. */
+ {
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc kernels
+ {
+ /* This is to make the OpenACC kernels construct unparallelizable. */
+ asm volatile ("" : : : "memory");
+
+#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100; i > -100; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_min != 0 || gangs_max != 1 - 1
+ || workers_min != 0 || workers_max != 1 - 1
+ || vectors_min != 0 || vectors_max != 1 - 1)
+ __builtin_abort ();
+ }
+
+
return 0;
}
diff --git libgomp/testsuite/libgomp.oacc-c/c.exp libgomp/testsuite/libgomp.oacc-c/c.exp
index e4be086..16f8295 100644
--- libgomp/testsuite/libgomp.oacc-c/c.exp
+++ libgomp/testsuite/libgomp.oacc-c/c.exp
@@ -15,6 +15,13 @@ load_lib libgomp-dg.exp
load_gcc_lib gcc-dg.exp
load_gcc_lib torture-options.exp
+proc check_effective_target_c { } {
+ return 1
+}
+proc check_effective_target_c++ { } {
+ return 0
+}
+
# Initialize dg.
dg-init
torture-init
Grüße
Thomas
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Runtime checking of OpenACC parallelism dimensions clauses
2017-05-11 12:27 Runtime checking of OpenACC parallelism dimensions clauses Thomas Schwinge
2017-05-14 10:29 ` Thomas Schwinge
@ 2017-05-19 11:03 ` Thomas Schwinge
2017-05-23 8:41 ` Jakub Jelinek
2 siblings, 0 replies; 5+ messages in thread
From: Thomas Schwinge @ 2017-05-19 11:03 UTC (permalink / raw)
To: gcc-patches, Jakub Jelinek
Hi!
Ping.
On Thu, 11 May 2017 14:24:05 +0200, I wrote:
> OK for trunk?
>
> commit 0ba48b4faf85420fbe12971afdd6e0afe70778bb
> Author: Thomas Schwinge <thomas@codesourcery.com>
> Date: Fri May 5 16:41:59 2017 +0200
>
> Runtime checking of OpenACC parallelism dimensions clauses
>
> libgomp/
> * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
> * testsuite/lib/libgomp.exp
> (check_effective_target_openacc_nvidia_accel_configured): New
> proc.
> * testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
> (check_effective_target_c++): New procs.
> * testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
> (check_effective_target_c++): Likewise.
> ---
> libgomp/testsuite/lib/libgomp.exp | 12 +
> libgomp/testsuite/libgomp.oacc-c++/c++.exp | 7 +
> .../libgomp.oacc-c-c++-common/parallel-dims.c | 523 ++++++++++++++++++++-
> libgomp/testsuite/libgomp.oacc-c/c.exp | 7 +
> 4 files changed, 537 insertions(+), 12 deletions(-)
>
> diff --git libgomp/testsuite/lib/libgomp.exp libgomp/testsuite/lib/libgomp.exp
> index 5e47872..62ee2e3 100644
> --- libgomp/testsuite/lib/libgomp.exp
> +++ libgomp/testsuite/lib/libgomp.exp
> @@ -358,6 +358,18 @@ proc check_effective_target_offload_device_shared_as { } {
> } ]
> }
>
> +# Return 1 if configured for nvptx offloading.
> +
> +proc check_effective_target_openacc_nvidia_accel_configured { } {
> + global offload_targets
> + if { ![string match "*,nvptx,*" ",$offload_targets,"] } {
> + return 0
> + }
> + # PR libgomp/65099: Currently, we only support offloading in 64-bit
> + # configurations.
> + return [is-effective-target lp64]
> +}
> +
> # Return 1 if at least one nvidia board is present.
>
> proc check_effective_target_openacc_nvidia_accel_present { } {
> diff --git libgomp/testsuite/libgomp.oacc-c++/c++.exp libgomp/testsuite/libgomp.oacc-c++/c++.exp
> index 608b298..9beadd6 100644
> --- libgomp/testsuite/libgomp.oacc-c++/c++.exp
> +++ libgomp/testsuite/libgomp.oacc-c++/c++.exp
> @@ -4,6 +4,13 @@ load_lib libgomp-dg.exp
> load_gcc_lib gcc-dg.exp
> load_gcc_lib torture-options.exp
>
> +proc check_effective_target_c { } {
> + return 0
> +}
> +proc check_effective_target_c++ { } {
> + return 1
> +}
> +
> global shlib_ext
>
> set shlib_ext [get_shlib_extension]
> diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> index f5766a4..d8af546 100644
> --- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> @@ -1,25 +1,524 @@
> -/* { dg-do run { target openacc_nvidia_accel_selected } } */
> +/* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
> + vector_length. */
> +
> +#include <limits.h>
> +#include <openacc.h>
> +
> +/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
> + not behaving as expected for -O0. */
> +#pragma acc routine seq
> +static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
> +{
> + if (acc_on_device ((int) acc_device_host))
> + return 0;
> + else if (acc_on_device ((int) acc_device_nvidia))
> + {
> + unsigned int r;
> + asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
> + return r;
> + }
> + else
> + __builtin_abort ();
> +}
> +
> +#pragma acc routine seq
> +static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
> +{
> + if (acc_on_device ((int) acc_device_host))
> + return 0;
> + else if (acc_on_device ((int) acc_device_nvidia))
> + {
> + unsigned int r;
> + asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
> + return r;
> + }
> + else
> + __builtin_abort ();
> +}
> +
> +#pragma acc routine seq
> +static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
> +{
> + if (acc_on_device ((int) acc_device_host))
> + return 0;
> + else if (acc_on_device ((int) acc_device_nvidia))
> + {
> + unsigned int r;
> + asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));
> + return r;
> + }
> + else
> + __builtin_abort ();
> +}
>
> -/* Worker and vector size checks. Picked an outrageously large
> - value. */
>
> int main ()
> {
> - int dummy[10];
> + acc_init (acc_device_default);
>
> -#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_workers" } */
> + /* Non-positive value. */
> +
> + /* GR, WS, VS. */
> + {
> +#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
> + int gangs_actual = GANGS;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> + reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
> + num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
> + {
> + /* We're actually executing with num_gangs (1). */
> + gangs_actual = 1;
> + for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
> + {
> + /* <https://gcc.gnu.org/PR80547>. */
> +#if 0
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> +#else
> + int gangs = acc_gang ();
> + gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
> + gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
> + int workers = acc_worker ();
> + workers_min = (workers_min < workers) ? workers_min : workers;
> + workers_max = (workers_max > workers) ? workers_max : workers;
> + int vectors = acc_vector ();
> + vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
> + vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
> +#endif
> + }
> + }
> + if (gangs_actual != 1)
> + __builtin_abort ();
> + if (gangs_min != 0 || gangs_max != gangs_actual - 1
> + || workers_min != 0 || workers_max != 0
> + || vectors_min != 0 || vectors_max != 0)
> + __builtin_abort ();
> +#undef GANGS
> + }
> +
> + /* GP, WS, VS. */
> + {
> +#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
> + int gangs_actual = GANGS;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> + num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
> + {
> + /* We're actually executing with num_gangs (1). */
> + gangs_actual = 1;
> +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (gangs_actual != 1)
> + __builtin_abort ();
> + if (gangs_min != 0 || gangs_max != gangs_actual - 1
> + || workers_min != 0 || workers_max != 0
> + || vectors_min != 0 || vectors_max != 0)
> + __builtin_abort ();
> +#undef GANGS
> + }
> +
> + /* GR, WP, VS. */
> + {
> +#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
> + int workers_actual = WORKERS;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (workers_actual) \
> + num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
> + {
> + /* We're actually executing with num_workers (1). */
> + workers_actual = 1;
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (workers_actual != 1)
> + __builtin_abort ();
> + if (gangs_min != 0 || gangs_max != 0
> + || workers_min != 0 || workers_max != workers_actual - 1
> + || vectors_min != 0 || vectors_max != 0)
> + __builtin_abort ();
> +#undef WORKERS
> + }
> +
> + /* GR, WS, VP. */
> + {
> +#define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
> + int vectors_actual = VECTORS;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \
> + vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
> + {
> + /* We're actually executing with vector_length (1), just the GCC nvptx
> + back end enforces vector_length (32). */
> + if (acc_on_device (acc_device_nvidia))
> + vectors_actual = 32;
> + else
> + vectors_actual = 1;
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (acc_get_device_type () == acc_device_nvidia)
> + {
> + if (vectors_actual != 32)
> + __builtin_abort ();
> + }
> + else
> + if (vectors_actual != 1)
> + __builtin_abort ();
> + if (gangs_min != 0 || gangs_max != 0
> + || workers_min != 0 || workers_max != 0
> + || vectors_min != 0 || vectors_max != vectors_actual - 1)
> + __builtin_abort ();
> +#undef VECTORS
> + }
> +
> +
> + /* High value. */
> +
> + /* GR, WS, VS. */
> + {
> + /* There is no actual limit for the number of gangs, so we try with a
> + rather high value. */
> + int gangs = 12345;
> + int gangs_actual = gangs;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> + reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
> + num_gangs (gangs)
> + {
> + if (acc_on_device (acc_device_host))
> + {
> + /* We're actually executing with num_gangs (1). */
> + gangs_actual = 1;
> + }
> + /* As we're executing GR not GP, don't multiply with a "gangs_actual"
> + factor. */
> + for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (gangs_actual < 1)
> + __builtin_abort ();
> + if (gangs_min != 0 || gangs_max != gangs_actual - 1
> + || workers_min != 0 || workers_max != 0
> + || vectors_min != 0 || vectors_max != 0)
> + __builtin_abort ();
> + }
> +
> + /* GP, WS, VS. */
> + {
> + /* There is no actual limit for the number of gangs, so we try with a
> + rather high value. */
> + int gangs = 12345;
> + int gangs_actual = gangs;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (gangs_actual) \
> + num_gangs (gangs)
> + {
> + if (acc_on_device (acc_device_host))
> + {
> + /* We're actually executing with num_gangs (1). */
> + gangs_actual = 1;
> + }
> +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (gangs_actual < 1)
> + __builtin_abort ();
> + if (gangs_min != 0 || gangs_max != gangs_actual - 1
> + || workers_min != 0 || workers_max != 0
> + || vectors_min != 0 || vectors_max != 0)
> + __builtin_abort ();
> + }
> +
> + /* GR, WP, VS. */
> + {
> + /* We try with an outrageously large value. */
> +#define WORKERS 2 << 20
> + int workers_actual = WORKERS;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
> + num_workers (WORKERS)
> + {
> + if (acc_on_device (acc_device_host))
> + {
> + /* We're actually executing with num_workers (1). */
> + workers_actual = 1;
> + }
> + else if (acc_on_device (acc_device_nvidia))
> + {
> + /* The GCC nvptx back end enforces num_workers (32). */
> + workers_actual = 32;
> + }
> + else
> + __builtin_abort ();
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (workers_actual < 1)
> + __builtin_abort ();
> + if (gangs_min != 0 || gangs_max != 0
> + || workers_min != 0 || workers_max != workers_actual - 1
> + || vectors_min != 0 || vectors_max != 0)
> + __builtin_abort ();
> +#undef WORKERS
> + }
> +
> + /* GR, WP, VS. */
> + {
> + /* We try with an outrageously large value. */
> + int workers = 2 << 20;
> + /* For nvptx offloading, this one will not result in "using num_workers
> + (32), ignoring runtime setting", and will in fact try to launch with
> + "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
> + error: invalid argument". So, limit ourselves here. */
> + if (acc_get_device_type () == acc_device_nvidia)
> + workers = 32;
> + int workers_actual = workers;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (workers_actual) \
> + num_workers (workers)
> + {
> + if (acc_on_device (acc_device_host))
> + {
> + /* We're actually executing with num_workers (1). */
> + workers_actual = 1;
> + }
> + else if (acc_on_device (acc_device_nvidia))
> + {
> + /* We're actually executing with num_workers (32). */
> + /* workers_actual = 32; */
> + }
> + else
> + __builtin_abort ();
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (workers_actual < 1)
> + __builtin_abort ();
> + if (gangs_min != 0 || gangs_max != 0
> + || workers_min != 0 || workers_max != workers_actual - 1
> + || vectors_min != 0 || vectors_max != 0)
> + __builtin_abort ();
> + }
> +
> + /* GR, WS, VP. */
> {
> -#pragma acc loop worker
> - for (int i = 0; i < 10; i++)
> - dummy[i] = i;
> + /* We try with an outrageously large value. */
> +#define VECTORS 2 << 20
> + int vectors_actual = VECTORS;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
> + vector_length (VECTORS)
> + {
> + if (acc_on_device (acc_device_host))
> + {
> + /* We're actually executing with vector_length (1). */
> + vectors_actual = 1;
> + }
> + else if (acc_on_device (acc_device_nvidia))
> + {
> + /* The GCC nvptx back end enforces vector_length (32). */
> + vectors_actual = 32;
> + }
> + else
> + __builtin_abort ();
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (vectors_actual < 1)
> + __builtin_abort ();
> + if (gangs_min != 0 || gangs_max != 0
> + || workers_min != 0 || workers_max != 0
> + || vectors_min != 0 || vectors_max != vectors_actual - 1)
> + __builtin_abort ();
> +#undef VECTORS
> }
>
> -#pragma acc parallel vector_length (2<<20) /* { dg-error "using vector_length" } */
> + /* GR, WS, VP. */
> {
> -#pragma acc loop vector
> - for (int i = 0; i < 10; i++)
> - dummy[i] = i;
> + /* We try with an outrageously large value. */
> + int vectors = 2 << 20;
> + int vectors_actual = vectors;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \
> + vector_length (vectors)
> + {
> + if (acc_on_device (acc_device_host))
> + {
> + /* We're actually executing with vector_length (1). */
> + vectors_actual = 1;
> + }
> + else if (acc_on_device (acc_device_nvidia))
> + {
> + /* The GCC nvptx back end enforces vector_length (32). */
> + vectors_actual = 32;
> + }
> + else
> + __builtin_abort ();
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (vectors_actual < 1)
> + __builtin_abort ();
> + if (gangs_min != 0 || gangs_max != 0
> + || workers_min != 0 || workers_max != 0
> + || vectors_min != 0 || vectors_max != vectors_actual - 1)
> + __builtin_abort ();
> }
>
> +
> + /* Composition of GP, WP, VP. */
> + {
> + int gangs = 12345;
> + /* With nvptx offloading, multi-level reductions apparently are very slow
> + in the following case. So, limit ourselves here. */
> + if (acc_get_device_type () == acc_device_nvidia)
> + gangs = 3;
> + int gangs_actual = gangs;
> +#define WORKERS 3
> + int workers_actual = WORKERS;
> +#define VECTORS 11
> + int vectors_actual = VECTORS;
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \
> + num_gangs (gangs) \
> + num_workers (WORKERS) \
> + vector_length (VECTORS)
> + {
> + if (acc_on_device (acc_device_host))
> + {
> + /* We're actually executing with num_gangs (1), num_workers (1),
> + vector_length (1). */
> + gangs_actual = 1;
> + workers_actual = 1;
> + vectors_actual = 1;
> + }
> + else if (acc_on_device (acc_device_nvidia))
> + {
> + /* The GCC nvptx back end enforces vector_length (32). */
> + vectors_actual = 32;
> + }
> + else
> + __builtin_abort ();
> +#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
> +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
> +#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (gangs_min != 0 || gangs_max != gangs_actual - 1
> + || workers_min != 0 || workers_max != workers_actual - 1
> + || vectors_min != 0 || vectors_max != vectors_actual - 1)
> + __builtin_abort ();
> +#undef VECTORS
> +#undef WORKERS
> + }
> +
> +
> + /* We can't test parallelized OpenACC kernels constructs in this way: use of
> + the acc_gang, acc_worker, acc_vector functions will make the construct
> + unparallelizable. */
> +
> +
> + /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
> + kernels. */
> + {
> + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
> + gangs_min = workers_min = vectors_min = INT_MAX;
> + gangs_max = workers_max = vectors_max = INT_MIN;
> +#pragma acc kernels
> + {
> + /* This is to make the OpenACC kernels construct unparallelizable. */
> + asm volatile ("" : : : "memory");
> +
> +#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
> + for (int i = 100; i > -100; --i)
> + {
> + gangs_min = gangs_max = acc_gang ();
> + workers_min = workers_max = acc_worker ();
> + vectors_min = vectors_max = acc_vector ();
> + }
> + }
> + if (gangs_min != 0 || gangs_max != 1 - 1
> + || workers_min != 0 || workers_max != 1 - 1
> + || vectors_min != 0 || vectors_max != 1 - 1)
> + __builtin_abort ();
> + }
> +
> +
> return 0;
> }
> diff --git libgomp/testsuite/libgomp.oacc-c/c.exp libgomp/testsuite/libgomp.oacc-c/c.exp
> index b509a10..4475bf5 100644
> --- libgomp/testsuite/libgomp.oacc-c/c.exp
> +++ libgomp/testsuite/libgomp.oacc-c/c.exp
> @@ -15,6 +15,13 @@ load_lib libgomp-dg.exp
> load_gcc_lib gcc-dg.exp
> load_gcc_lib torture-options.exp
>
> +proc check_effective_target_c { } {
> + return 1
> +}
> +proc check_effective_target_c++ { } {
> + return 0
> +}
> +
> # Initialize dg.
> dg-init
> torture-init
Grüße
Thomas
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Runtime checking of OpenACC parallelism dimensions clauses
2017-05-11 12:27 Runtime checking of OpenACC parallelism dimensions clauses Thomas Schwinge
2017-05-14 10:29 ` Thomas Schwinge
2017-05-19 11:03 ` Thomas Schwinge
@ 2017-05-23 8:41 ` Jakub Jelinek
2017-05-23 9:48 ` Thomas Schwinge
2 siblings, 1 reply; 5+ messages in thread
From: Jakub Jelinek @ 2017-05-23 8:41 UTC (permalink / raw)
To: Thomas Schwinge; +Cc: gcc-patches
On Thu, May 11, 2017 at 02:24:05PM +0200, Thomas Schwinge wrote:
> Hi!
>
> OK for trunk?
>
> commit 0ba48b4faf85420fbe12971afdd6e0afe70778bb
> Author: Thomas Schwinge <thomas@codesourcery.com>
> Date: Fri May 5 16:41:59 2017 +0200
>
> Runtime checking of OpenACC parallelism dimensions clauses
>
> libgomp/
> * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
> * testsuite/lib/libgomp.exp
> (check_effective_target_openacc_nvidia_accel_configured): New
> proc.
> * testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
> (check_effective_target_c++): New procs.
> * testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
> (check_effective_target_c++): Likewise.
Ok.
Jakub
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: Runtime checking of OpenACC parallelism dimensions clauses
2017-05-23 8:41 ` Jakub Jelinek
@ 2017-05-23 9:48 ` Thomas Schwinge
0 siblings, 0 replies; 5+ messages in thread
From: Thomas Schwinge @ 2017-05-23 9:48 UTC (permalink / raw)
To: Jakub Jelinek, gcc-patches
Hi!
On Tue, 23 May 2017 10:25:12 +0200, Jakub Jelinek <jakub@redhat.com> wrote:
> On Thu, May 11, 2017 at 02:24:05PM +0200, Thomas Schwinge wrote:
> > OK for trunk?
> > Runtime checking of OpenACC parallelism dimensions clauses
> Ok.
Thanks. As posted, committed to trunk in r248358:
commit 681ad5cef0c3153f1233ef178c01ad53e7b9c405
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Tue May 23 09:16:05 2017 +0000
Runtime checking of OpenACC parallelism dimensions clauses
libgomp/
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
* testsuite/lib/libgomp.exp
(check_effective_target_openacc_nvidia_accel_configured): New
proc.
* testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
(check_effective_target_c++): New procs.
* testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
(check_effective_target_c++): Likewise.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@248358 138bc75d-0d04-0410-961f-82ee72b054a4
---
libgomp/ChangeLog | 11 +
libgomp/testsuite/lib/libgomp.exp | 12 +
libgomp/testsuite/libgomp.oacc-c++/c++.exp | 7 +
.../libgomp.oacc-c-c++-common/parallel-dims.c | 523 ++++++++++++++++++++-
libgomp/testsuite/libgomp.oacc-c/c.exp | 7 +
5 files changed, 548 insertions(+), 12 deletions(-)
diff --git libgomp/ChangeLog libgomp/ChangeLog
index 8209f9f..8fd5f07 100644
--- libgomp/ChangeLog
+++ libgomp/ChangeLog
@@ -1,3 +1,14 @@
+2017-05-23 Thomas Schwinge <thomas@codesourcery.com>
+
+ * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
+ * testsuite/lib/libgomp.exp
+ (check_effective_target_openacc_nvidia_accel_configured): New
+ proc.
+ * testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
+ (check_effective_target_c++): New procs.
+ * testsuite/libgomp.oacc-c/c.exp (check_effective_target_c)
+ (check_effective_target_c++): Likewise.
+
2017-05-22 Jakub Jelinek <jakub@redhat.com>
PR middle-end/80809
diff --git libgomp/testsuite/lib/libgomp.exp libgomp/testsuite/lib/libgomp.exp
index 501a860..ea3da2c 100644
--- libgomp/testsuite/lib/libgomp.exp
+++ libgomp/testsuite/lib/libgomp.exp
@@ -359,6 +359,18 @@ proc check_effective_target_offload_device_shared_as { } {
} ]
}
+# Return 1 if configured for nvptx offloading.
+
+proc check_effective_target_openacc_nvidia_accel_configured { } {
+ global offload_targets
+ if { ![string match "*,nvptx,*" ",$offload_targets,"] } {
+ return 0
+ }
+ # PR libgomp/65099: Currently, we only support offloading in 64-bit
+ # configurations.
+ return [is-effective-target lp64]
+}
+
# Return 1 if at least one nvidia board is present.
proc check_effective_target_openacc_nvidia_accel_present { } {
diff --git libgomp/testsuite/libgomp.oacc-c++/c++.exp libgomp/testsuite/libgomp.oacc-c++/c++.exp
index 608b298..9beadd6 100644
--- libgomp/testsuite/libgomp.oacc-c++/c++.exp
+++ libgomp/testsuite/libgomp.oacc-c++/c++.exp
@@ -4,6 +4,13 @@ load_lib libgomp-dg.exp
load_gcc_lib gcc-dg.exp
load_gcc_lib torture-options.exp
+proc check_effective_target_c { } {
+ return 0
+}
+proc check_effective_target_c++ { } {
+ return 1
+}
+
global shlib_ext
set shlib_ext [get_shlib_extension]
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index f5766a4..d8af546 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -1,25 +1,524 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
+ vector_length. */
+
+#include <limits.h>
+#include <openacc.h>
+
+/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
+ not behaving as expected for -O0. */
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
+{
+ if (acc_on_device ((int) acc_device_host))
+ return 0;
+ else if (acc_on_device ((int) acc_device_nvidia))
+ {
+ unsigned int r;
+ asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
+ return r;
+ }
+ else
+ __builtin_abort ();
+}
+
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
+{
+ if (acc_on_device ((int) acc_device_host))
+ return 0;
+ else if (acc_on_device ((int) acc_device_nvidia))
+ {
+ unsigned int r;
+ asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
+ return r;
+ }
+ else
+ __builtin_abort ();
+}
+
+#pragma acc routine seq
+static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
+{
+ if (acc_on_device ((int) acc_device_host))
+ return 0;
+ else if (acc_on_device ((int) acc_device_nvidia))
+ {
+ unsigned int r;
+ asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));
+ return r;
+ }
+ else
+ __builtin_abort ();
+}
-/* Worker and vector size checks. Picked an outrageously large
- value. */
int main ()
{
- int dummy[10];
+ acc_init (acc_device_default);
-#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_workers" } */
+ /* Non-positive value. */
+
+ /* GR, WS, VS. */
+ {
+#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
+ int gangs_actual = GANGS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
+ num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+ {
+ /* <https://gcc.gnu.org/PR80547>. */
+#if 0
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+#else
+ int gangs = acc_gang ();
+ gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
+ gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
+ int workers = acc_worker ();
+ workers_min = (workers_min < workers) ? workers_min : workers;
+ workers_max = (workers_max > workers) ? workers_max : workers;
+ int vectors = acc_vector ();
+ vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
+ vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
+#endif
+ }
+ }
+ if (gangs_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef GANGS
+ }
+
+ /* GP, WS, VS. */
+ {
+#define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
+ int gangs_actual = GANGS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+ num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef GANGS
+ }
+
+ /* GR, WP, VS. */
+ {
+#define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
+ int workers_actual = WORKERS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) \
+ num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with num_workers (1). */
+ workers_actual = 1;
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (workers_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef WORKERS
+ }
+
+ /* GR, WS, VP. */
+ {
+#define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
+ int vectors_actual = VECTORS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \
+ vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
+ {
+ /* We're actually executing with vector_length (1), just the GCC nvptx
+ back end enforces vector_length (32). */
+ if (acc_on_device (acc_device_nvidia))
+ vectors_actual = 32;
+ else
+ vectors_actual = 1;
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (acc_get_device_type () == acc_device_nvidia)
+ {
+ if (vectors_actual != 32)
+ __builtin_abort ();
+ }
+ else
+ if (vectors_actual != 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
+#undef VECTORS
+ }
+
+
+ /* High value. */
+
+ /* GR, WS, VS. */
+ {
+ /* There is no actual limit for the number of gangs, so we try with a
+ rather high value. */
+ int gangs = 12345;
+ int gangs_actual = gangs;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
+ num_gangs (gangs)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+ }
+ /* As we're executing GR not GP, don't multiply with a "gangs_actual"
+ factor. */
+ for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+ }
+
+ /* GP, WS, VS. */
+ {
+ /* There is no actual limit for the number of gangs, so we try with a
+ rather high value. */
+ int gangs = 12345;
+ int gangs_actual = gangs;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual) \
+ num_gangs (gangs)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_gangs (1). */
+ gangs_actual = 1;
+ }
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+ }
+
+ /* GR, WP, VS. */
+ {
+ /* We try with an outrageously large value. */
+#define WORKERS 2 << 20
+ int workers_actual = WORKERS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
+ num_workers (WORKERS)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_workers (1). */
+ workers_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces num_workers (32). */
+ workers_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (workers_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+#undef WORKERS
+ }
+
+ /* GR, WP, VS. */
+ {
+ /* We try with an outrageously large value. */
+ int workers = 2 << 20;
+ /* For nvptx offloading, this one will not result in "using num_workers
+ (32), ignoring runtime setting", and will in fact try to launch with
+ "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
+ error: invalid argument". So, limit ourselves here. */
+ if (acc_get_device_type () == acc_device_nvidia)
+ workers = 32;
+ int workers_actual = workers;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (workers_actual) \
+ num_workers (workers)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_workers (1). */
+ workers_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* We're actually executing with num_workers (32). */
+ /* workers_actual = 32; */
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (workers_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != 0)
+ __builtin_abort ();
+ }
+
+ /* GR, WS, VP. */
{
-#pragma acc loop worker
- for (int i = 0; i < 10; i++)
- dummy[i] = i;
+ /* We try with an outrageously large value. */
+#define VECTORS 2 << 20
+ int vectors_actual = VECTORS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
+ vector_length (VECTORS)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with vector_length (1). */
+ vectors_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces vector_length (32). */
+ vectors_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (vectors_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
+#undef VECTORS
}
-#pragma acc parallel vector_length (2<<20) /* { dg-error "using vector_length" } */
+ /* GR, WS, VP. */
{
-#pragma acc loop vector
- for (int i = 0; i < 10; i++)
- dummy[i] = i;
+ /* We try with an outrageously large value. */
+ int vectors = 2 << 20;
+ int vectors_actual = vectors;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \
+ vector_length (vectors)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with vector_length (1). */
+ vectors_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces vector_length (32). */
+ vectors_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (vectors_actual < 1)
+ __builtin_abort ();
+ if (gangs_min != 0 || gangs_max != 0
+ || workers_min != 0 || workers_max != 0
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
}
+
+ /* Composition of GP, WP, VP. */
+ {
+ int gangs = 12345;
+ /* With nvptx offloading, multi-level reductions apparently are very slow
+ in the following case. So, limit ourselves here. */
+ if (acc_get_device_type () == acc_device_nvidia)
+ gangs = 3;
+ int gangs_actual = gangs;
+#define WORKERS 3
+ int workers_actual = WORKERS;
+#define VECTORS 11
+ int vectors_actual = VECTORS;
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \
+ num_gangs (gangs) \
+ num_workers (WORKERS) \
+ vector_length (VECTORS)
+ {
+ if (acc_on_device (acc_device_host))
+ {
+ /* We're actually executing with num_gangs (1), num_workers (1),
+ vector_length (1). */
+ gangs_actual = 1;
+ workers_actual = 1;
+ vectors_actual = 1;
+ }
+ else if (acc_on_device (acc_device_nvidia))
+ {
+ /* The GCC nvptx back end enforces vector_length (32). */
+ vectors_actual = 32;
+ }
+ else
+ __builtin_abort ();
+#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
+#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
+#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_min != 0 || gangs_max != gangs_actual - 1
+ || workers_min != 0 || workers_max != workers_actual - 1
+ || vectors_min != 0 || vectors_max != vectors_actual - 1)
+ __builtin_abort ();
+#undef VECTORS
+#undef WORKERS
+ }
+
+
+ /* We can't test parallelized OpenACC kernels constructs in this way: use of
+ the acc_gang, acc_worker, acc_vector functions will make the construct
+ unparallelizable. */
+
+
+ /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
+ kernels. */
+ {
+ int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
+ gangs_min = workers_min = vectors_min = INT_MAX;
+ gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc kernels
+ {
+ /* This is to make the OpenACC kernels construct unparallelizable. */
+ asm volatile ("" : : : "memory");
+
+#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
+ for (int i = 100; i > -100; --i)
+ {
+ gangs_min = gangs_max = acc_gang ();
+ workers_min = workers_max = acc_worker ();
+ vectors_min = vectors_max = acc_vector ();
+ }
+ }
+ if (gangs_min != 0 || gangs_max != 1 - 1
+ || workers_min != 0 || workers_max != 1 - 1
+ || vectors_min != 0 || vectors_max != 1 - 1)
+ __builtin_abort ();
+ }
+
+
return 0;
}
diff --git libgomp/testsuite/libgomp.oacc-c/c.exp libgomp/testsuite/libgomp.oacc-c/c.exp
index b509a10..4475bf5 100644
--- libgomp/testsuite/libgomp.oacc-c/c.exp
+++ libgomp/testsuite/libgomp.oacc-c/c.exp
@@ -15,6 +15,13 @@ load_lib libgomp-dg.exp
load_gcc_lib gcc-dg.exp
load_gcc_lib torture-options.exp
+proc check_effective_target_c { } {
+ return 1
+}
+proc check_effective_target_c++ { } {
+ return 0
+}
+
# Initialize dg.
dg-init
torture-init
Grüße
Thomas
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2017-05-23 9:18 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-11 12:27 Runtime checking of OpenACC parallelism dimensions clauses Thomas Schwinge
2017-05-14 10:29 ` Thomas Schwinge
2017-05-19 11:03 ` Thomas Schwinge
2017-05-23 8:41 ` Jakub Jelinek
2017-05-23 9:48 ` Thomas Schwinge
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).