public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes
@ 2018-09-25 13:12 Chung-Lin Tang
  2018-12-07 15:31 ` Thomas Schwinge
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Chung-Lin Tang @ 2018-09-25 13:12 UTC (permalink / raw)
  To: gcc-patches, Thomas Schwinge

[-- Attachment #1: Type: text/plain, Size: 573 bytes --]

These are the testsuite/libgomp.oacc-c-c++-common/* changes.

Thanks,
Chung-Lin

	* testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c: New testcase.
	* testsuite/libgomp.oacc-c-c++-common/data-2-lib.c: Adjust testcase.
	* testsuite/libgomp.oacc-c-c++-common/data-2.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/data-3.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-71.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-77.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-81.c: Likewise.

[-- Attachment #2: async-05.c-c++-testsuite.patch --]
[-- Type: text/plain, Size: 20888 bytes --]

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c
new file mode 100644
index 0000000..9420540
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c
@@ -0,0 +1,904 @@
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* { dg-additional-options "-lcuda" } */
+
+#include <openacc.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+#include <stdio.h>
+#include <time.h>
+#include <sys/time.h>
+
+int
+main (int argc, char **argv)
+{
+    CUresult r;
+    CUstream stream1;
+    int N = 128; //1024 * 1024;
+    float *a, *b, *c, *d, *e;
+    int i;
+    int nbytes;
+
+    srand (time (NULL));
+    int s = rand () % 100;
+
+    acc_init (acc_device_nvidia);
+
+    nbytes = N * sizeof (float);
+
+    a = (float *) malloc (nbytes);
+    b = (float *) malloc (nbytes);
+    c = (float *) malloc (nbytes);
+    d = (float *) malloc (nbytes);
+    e = (float *) malloc (nbytes);
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 3.0;
+        b[i] = 0.0;
+    }
+
+    acc_set_default_async (s);
+
+#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
+    {
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = a[ii];
+    }
+
+#pragma acc wait
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 3.0)
+            abort ();
+
+        if (b[i] != 3.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 2.0;
+        b[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
+    {
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = a[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 2.0)
+            abort ();
+
+        if (b[i] != 2.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 3.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
+    {
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 3.0)
+            abort ();
+
+        if (b[i] != 9.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+
+        if (d[i] != 1.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 2.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+        e[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
+    {
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
+    }
+
+#pragma acc parallel wait (s) async (s)
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 2.0)
+            abort ();
+
+        if (b[i] != 4.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+
+        if (d[i] != 1.0)
+            abort ();
+
+        if (e[i] != 11.0)
+            abort ();
+    }
+
+
+    r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING);
+    if (r != CUDA_SUCCESS)
+    {
+        fprintf (stderr, "cuStreamCreate failed: %d\n", r);
+        abort ();
+    }
+
+    acc_set_cuda_stream (1, stream1);
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 5.0;
+        b[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N], b[0:N]) copyin (N)
+    {
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = a[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 5.0)
+            abort ();
+
+        if (b[i] != 5.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 7.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
+    {
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 7.0)
+            abort ();
+
+        if (b[i] != 49.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+
+        if (d[i] != 1.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 3.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+        e[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
+    {
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
+    }
+
+#pragma acc parallel wait (s) async (s)
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 3.0)
+            abort ();
+
+        if (b[i] != 9.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+
+        if (d[i] != 1.0)
+            abort ();
+
+        if (e[i] != 17.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 4.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+        e[i] = 0.0;
+    }
+
+#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
+    {
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+#pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 4.0)
+            abort ();
+
+        if (b[i] != 16.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+    }
+
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 5.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+        e[i] = 0.0;
+    }
+
+#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
+    {
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc parallel async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+#pragma acc update host (a[0:N], b[0:N], c[0:N]) async
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 5.0)
+            abort ();
+
+        if (b[i] != 25.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 3.0;
+        b[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
+    {
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = a[ii];
+    }
+
+#pragma acc wait
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 3.0)
+            abort ();
+
+        if (b[i] != 3.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 2.0;
+        b[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
+    {
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = a[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 2.0)
+            abort ();
+
+        if (b[i] != 2.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 3.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
+    {
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 3.0)
+            abort ();
+
+        if (b[i] != 9.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+
+        if (d[i] != 1.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 2.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+        e[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
+    {
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
+    }
+
+#pragma acc kernels wait (s) async (s)
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 2.0)
+            abort ();
+
+        if (b[i] != 4.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+
+        if (d[i] != 1.0)
+            abort ();
+
+        if (e[i] != 11.0)
+            abort ();
+    }
+
+
+    r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING);
+    if (r != CUDA_SUCCESS)
+    {
+        fprintf (stderr, "cuStreamCreate failed: %d\n", r);
+        abort ();
+    }
+
+    acc_set_cuda_stream (1, stream1);
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 5.0;
+        b[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N], b[0:N]) copyin (N)
+    {
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = a[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 5.0)
+            abort ();
+
+        if (b[i] != 5.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 7.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
+    {
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 7.0)
+            abort ();
+
+        if (b[i] != 49.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+
+        if (d[i] != 1.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 3.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+        e[i] = 0.0;
+    }
+
+#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
+    {
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
+    }
+
+#pragma acc kernels wait (s) async (s)
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
+    }
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 3.0)
+            abort ();
+
+        if (b[i] != 9.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+
+        if (d[i] != 1.0)
+            abort ();
+
+        if (e[i] != 17.0)
+            abort ();
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 4.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+        e[i] = 0.0;
+    }
+
+#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
+    {
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+#pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 4.0)
+            abort ();
+
+        if (b[i] != 16.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+    }
+
+
+    for (i = 0; i < N; i++)
+    {
+        a[i] = 5.0;
+        b[i] = 0.0;
+        c[i] = 0.0;
+        d[i] = 0.0;
+        e[i] = 0.0;
+    }
+
+#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
+    {
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
+    }
+
+#pragma acc kernels async
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
+    }
+
+#pragma acc update host (a[0:N], b[0:N], c[0:N]) async
+
+#pragma acc wait (s)
+
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (a[i] != 5.0)
+            abort ();
+
+        if (b[i] != 25.0)
+            abort ();
+
+        if (c[i] != 4.0)
+            abort ();
+    }
+
+    acc_shutdown (acc_device_nvidia);
+
+    return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
index 2ddfa7d..f553d3d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
@@ -153,7 +153,7 @@ main (int argc, char **argv)
     d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
 
 #pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N], N) \
-  async (4)
+  wait (1, 2, 3) async (4)
   for (int ii = 0; ii < N; ii++)
     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
index 0c6abe6..81d623a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
@@ -162,7 +162,7 @@ main (int argc, char **argv)
     d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
 
 #pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) \
-  wait (1) async (4)
+  wait (1, 2, 3) async (4)
   for (int ii = 0; ii < N; ii++)
     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
index 0bf706a..5ec50b8 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
@@ -138,7 +138,7 @@ main (int argc, char **argv)
     d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
 
 #pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) \
-  wait (1,5) async (4)
+  wait (1, 2, 3, 5) async (4)
   for (int ii = 0; ii < N; ii++)
     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
index c85e824..6afe2a0 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
@@ -92,16 +92,22 @@ main (int argc, char **argv)
       abort ();
     }
 
-  fprintf (stderr, "CheCKpOInT\n");
-  if (acc_async_test (1) != 0)
+  if (acc_async_test (0) != 0)
     {
       fprintf (stderr, "asynchronous operation not running\n");
       abort ();
     }
 
+  /* Test unseen async number.  */
+  if (acc_async_test (1) != 1)
+    {
+      fprintf (stderr, "acc_async_test failed on unseen number\n");
+      abort ();
+    }
+
   sleep ((int) (dtime / 1000.0f) + 1);
 
-  if (acc_async_test (1) != 1)
+  if (acc_async_test (0) != 1)
     {
       fprintf (stderr, "found asynchronous operation still running\n");
       abort ();
@@ -116,7 +122,3 @@ main (int argc, char **argv)
 
   return 0;
 }
-
-/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
-/* { dg-output "unknown async \[0-9\]+" } */
-/* { dg-shouldfail "" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
index f4f196d..2821f88 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
@@ -111,7 +111,7 @@ main (int argc, char **argv)
 
   start_timer (0);
 
-  acc_wait (1);
+  acc_wait (0);
 
   atime = stop_timer (0);
 
@@ -132,7 +132,3 @@ main (int argc, char **argv)
 
   return 0;
 }
-
-/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
-/* { dg-output "unknown async \[0-9\]+" } */
-/* { dg-shouldfail "" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
index ef3df13..b22af26 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
@@ -114,6 +114,7 @@ main (int argc, char **argv)
 
   for (i = 0; i < N; i++)
     {
+      stream = (CUstream) acc_get_cuda_stream (i & 1);
       r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
       if (r != CUDA_SUCCESS)
 	{
@@ -122,11 +123,11 @@ main (int argc, char **argv)
 	}
     }
 
-  acc_wait_async (0, 1);
-
   if (acc_async_test (0) != 0)
     abort ();
 
+  acc_wait_async (0, 1);
+
   if (acc_async_test (1) != 0)
     abort ();
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
index d5f18f0..30a4b57 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
@@ -133,7 +133,7 @@ main (int argc, char **argv)
 
   for (i = 0; i <= N; i++)
     {
-      if (acc_async_test (i) != 0)
+      if (acc_async_test (i) == 0)
 	abort ();
     }
 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes
  2018-09-25 13:12 [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes Chung-Lin Tang
@ 2018-12-07 15:31 ` Thomas Schwinge
  2018-12-14 21:09   ` Thomas Schwinge
  2018-12-07 15:39 ` [PR88407] [OpenACC] Correctly handle unseen async-arguments (was: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes) Thomas Schwinge
  2018-12-07 15:57 ` Too strict synchronization with the local (host) thread? (was: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes) Thomas Schwinge
  2 siblings, 1 reply; 8+ messages in thread
From: Thomas Schwinge @ 2018-12-07 15:31 UTC (permalink / raw)
  To: Chung-Lin Tang; +Cc: gcc-patches

Hi Chung-Lin!

On Tue, 25 Sep 2018 21:11:42 +0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote:
> These are the testsuite/libgomp.oacc-c-c++-common/* changes.

Please commit the following three hunks to trunk: the code as it stands
does not declare its async/wait dependencies correctly.  To record the
review effort, please include "Reviewed-by: Thomas Schwinge
<thomas@codesourcery.com>" in the commit log, see
<https://gcc.gnu.org/wiki/Reviewed-by>.

> diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
> index 2ddfa7d..f553d3d 100644
> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
> @@ -153,7 +153,7 @@ main (int argc, char **argv)
>      d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
>  
>  #pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N], N) \
> -  async (4)
> +  wait (1, 2, 3) async (4)
>    for (int ii = 0; ii < N; ii++)
>      e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
>  
> diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
> index 0c6abe6..81d623a 100644
> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
> @@ -162,7 +162,7 @@ main (int argc, char **argv)
>      d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
>  
>  #pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) \
> -  wait (1) async (4)
> +  wait (1, 2, 3) async (4)
>    for (int ii = 0; ii < N; ii++)
>      e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
>  
> diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
> index 0bf706a..5ec50b8 100644
> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
> @@ -138,7 +138,7 @@ main (int argc, char **argv)
>      d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
>  
>  #pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) \
> -  wait (1,5) async (4)
> +  wait (1, 2, 3, 5) async (4)
>    for (int ii = 0; ii < N; ii++)
>      e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
>  


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PR88407] [OpenACC] Correctly handle unseen async-arguments (was: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes)
  2018-09-25 13:12 [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes Chung-Lin Tang
  2018-12-07 15:31 ` Thomas Schwinge
@ 2018-12-07 15:39 ` Thomas Schwinge
  2018-12-14 21:13   ` [PR88407] [OpenACC] Correctly handle unseen async-arguments Thomas Schwinge
  2018-12-07 15:57 ` Too strict synchronization with the local (host) thread? (was: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes) Thomas Schwinge
  2 siblings, 1 reply; 8+ messages in thread
From: Thomas Schwinge @ 2018-12-07 15:39 UTC (permalink / raw)
  To: Chung-Lin Tang, gcc-patches

Hi Chung-Lin!

On Tue, 25 Sep 2018 21:11:42 +0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote:
> These are the testsuite/libgomp.oacc-c-c++-common/* changes.

> diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
> index c85e824..6afe2a0 100644
> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
> @@ -92,16 +92,22 @@ main (int argc, char **argv)
>        abort ();
>      }
>  
> -  fprintf (stderr, "CheCKpOInT\n");
> -  if (acc_async_test (1) != 0)
> +  if (acc_async_test (0) != 0)
>      {
>        fprintf (stderr, "asynchronous operation not running\n");
>        abort ();
>      }
>  
> +  /* Test unseen async number.  */
> +  if (acc_async_test (1) != 1)
> +    {
> +      fprintf (stderr, "acc_async_test failed on unseen number\n");
> +      abort ();
> +    }
> +
>    sleep ((int) (dtime / 1000.0f) + 1);
>  
> -  if (acc_async_test (1) != 1)
> +  if (acc_async_test (0) != 1)
>      {
>        fprintf (stderr, "found asynchronous operation still running\n");
>        abort ();
> @@ -116,7 +122,3 @@ main (int argc, char **argv)
>  
>    return 0;
>  }
> -
> -/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
> -/* { dg-output "unknown async \[0-9\]+" } */
> -/* { dg-shouldfail "" } */

That's now correct OpenACC usage, but you've now made this one
essentially the same as "libgomp.oacc-c-c++-common/lib-69.c".

> diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
> index f4f196d..2821f88 100644
> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
> @@ -111,7 +111,7 @@ main (int argc, char **argv)
>  
>    start_timer (0);
>  
> -  acc_wait (1);
> +  acc_wait (0);
>  
>    atime = stop_timer (0);
>  
> @@ -132,7 +132,3 @@ main (int argc, char **argv)
>  
>    return 0;
>  }
> -
> -/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
> -/* { dg-output "unknown async \[0-9\]+" } */
> -/* { dg-shouldfail "" } */

Again, that's now correct OpenACC usage, but you've now made this one
essentially the same as "libgomp.oacc-c-c++-common/lib-74.c".


So, confused about the intended behavior, I've asked the OpenACC
committee to clarify, and filed <https://gcc.gnu.org/PR88407> "[OpenACC]
Correctly handle unseen async-arguments".

Assuming this gets clarified in the way I think it should, I suggest the
following.  Any comments?

commit a34177a6ce637da8060394f69358f25bce90a8be
Author: Thomas Schwinge <thomas@codesourcery.com>
Date:   Fri Dec 7 16:36:53 2018 +0100

    [PR88407] [OpenACC] Correctly handle unseen async-arguments
    
    ... which turn the operation into a no-op.
    
            libgomp/
            * plugin/plugin-nvptx.c (nvptx_async_test, nvptx_wait)
            (nvptx_wait_async): Unseen async-argument is a no-op.
            * testsuite/libgomp.oacc-c-c++-common/async_queue-1.c: Update.
            * testsuite/libgomp.oacc-c-c++-common/data-2-lib.c: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/data-2.c: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise.
            * testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/lib-71.c: Merge into...
            * testsuite/libgomp.oacc-c-c++-common/lib-69.c: ... this.  Update.
            * testsuite/libgomp.oacc-c-c++-common/lib-77.c: Merge into...
            * testsuite/libgomp.oacc-c-c++-common/lib-74.c: ... this.  Update.
---
 libgomp/plugin/plugin-nvptx.c                      |  13 +-
 .../libgomp.oacc-c-c++-common/async_queue-1.c      |  30 +++++
 .../libgomp.oacc-c-c++-common/data-2-lib.c         |   2 +
 .../testsuite/libgomp.oacc-c-c++-common/data-2.c   |   2 +
 .../testsuite/libgomp.oacc-c-c++-common/lib-69.c   |   7 ++
 .../testsuite/libgomp.oacc-c-c++-common/lib-71.c   | 122 ------------------
 .../testsuite/libgomp.oacc-c-c++-common/lib-74.c   |   4 +
 .../testsuite/libgomp.oacc-c-c++-common/lib-77.c   | 138 ---------------------
 .../testsuite/libgomp.oacc-c-c++-common/lib-79.c   |  24 ++++
 libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90  |   5 +
 10 files changed, 80 insertions(+), 267 deletions(-)

diff --git libgomp/plugin/plugin-nvptx.c libgomp/plugin/plugin-nvptx.c
index 7d0d38e0c2e1..6f9b16634b10 100644
--- libgomp/plugin/plugin-nvptx.c
+++ libgomp/plugin/plugin-nvptx.c
@@ -1539,9 +1539,8 @@ nvptx_async_test (int async)
   struct ptx_stream *s;
 
   s = select_stream_for_async (async, pthread_self (), false, NULL);
-
   if (!s)
-    GOMP_PLUGIN_fatal ("unknown async %d", async);
+    return 1;
 
   r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
   if (r == CUDA_SUCCESS)
@@ -1596,7 +1595,7 @@ nvptx_wait (int async)
 
   s = select_stream_for_async (async, pthread_self (), false, NULL);
   if (!s)
-    GOMP_PLUGIN_fatal ("unknown async %d", async);
+    return;
 
   CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
 
@@ -1610,14 +1609,14 @@ nvptx_wait_async (int async1, int async2)
   struct ptx_stream *s1, *s2;
   pthread_t self = pthread_self ();
 
+  s1 = select_stream_for_async (async1, self, false, NULL);
+  if (!s1)
+    return;
+
   /* The stream that is waiting (rather than being waited for) doesn't
      necessarily have to exist already.  */
   s2 = select_stream_for_async (async2, self, true, NULL);
 
-  s1 = select_stream_for_async (async1, self, false, NULL);
-  if (!s1)
-    GOMP_PLUGIN_fatal ("invalid async 1\n");
-
   if (s1 == s2)
     GOMP_PLUGIN_fatal ("identical parameters");
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
index 48e1846a36e3..544b19fe6635 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
@@ -41,6 +41,36 @@ int main(void)
       assert (queues[i].cuda_stream == NULL);
     }
 
+  /* No-ops still don't initialize them.  */
+  {
+    size_t i = 0;
+    /* Find the first non-special async-argument.  */
+    while (queues[i].async < 0)
+      ++i;
+    assert (i < queues_n);
+
+#pragma acc wait(queues[i].async) // no-op
+
+    ++i;
+    assert (i < queues_n);
+#pragma acc parallel wait(queues[i].async) // no-op
+    ;
+
+    ++i;
+    assert (i < queues_n);
+    acc_wait(queues[i].async); // no-op
+
+    i += 2;
+    assert (i < queues_n);
+    acc_wait_async(queues[i - 1].async, queues[i].async); // no-op, and async queue "i" does not get set up
+
+    for (size_t i = 0; i < queues_n; ++i)
+      {
+	queues[i].cuda_stream = acc_get_cuda_stream (queues[i].async);
+	assert (queues[i].cuda_stream == NULL);
+      }
+  }
+
   for (size_t i = 0; i < queues_n; ++i)
     {
       /* Use the queue to initialize it.  */
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
index 2ddfa7d4a01b..806c2f405a01 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
@@ -65,6 +65,8 @@ main (int argc, char **argv)
     b[i] = a[i];
 
   acc_wait (1);
+  /* Test unseen async-argument.  */
+  acc_wait (10);
 
   acc_memcpy_from_device (a, d_a, nbytes);
   acc_memcpy_from_device (b, d_b, nbytes);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
index 0c6abe69dc17..b552b94529d8 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
@@ -85,6 +85,8 @@ main (int argc, char **argv)
 
 #pragma acc exit data copyout (a[0:N]) copyout (b[0:N]) wait (1) async (1)
 #pragma acc wait (1)
+  /* Test unseen async-argument.  */
+#pragma acc wait (10)
 
   for (i = 0; i < N; i++)
     {
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
index 5462f1253522..c10bebaab136 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
@@ -103,6 +103,13 @@ main (int argc, char **argv)
       abort ();
     }
 
+  /* Test unseen async-argument.  */
+  if (acc_async_test (1) != 1)
+    {
+      fprintf (stderr, "acc_async_test failed on unseen async-argument\n");
+      abort ();
+    }
+
   sleep (1);
 
   if (acc_async_test (0) != 1)
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
deleted file mode 100644
index c85e82459554..000000000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
-/* { dg-additional-options "-lcuda" } */
-
-#include <stdio.h>
-#include <unistd.h>
-#include <openacc.h>
-#include <cuda.h>
-
-int
-main (int argc, char **argv)
-{
-  CUdevice dev;
-  CUfunction delay;
-  CUmodule module;
-  CUresult r;
-  CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
-
-  acc_init (acc_device_nvidia);
-
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleLoad (&module, "subr.ptx");
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleGetFunction (&delay, module, "delay");
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
-      abort ();
-    }
-
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
-  if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-	  abort ();
-	}
-
-  acc_set_cuda_stream (0, stream);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
-      abort ();
-    }
-
-  fprintf (stderr, "CheCKpOInT\n");
-  if (acc_async_test (1) != 0)
-    {
-      fprintf (stderr, "asynchronous operation not running\n");
-      abort ();
-    }
-
-  sleep ((int) (dtime / 1000.0f) + 1);
-
-  if (acc_async_test (1) != 1)
-    {
-      fprintf (stderr, "found asynchronous operation still running\n");
-      abort ();
-    }
-
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
-  acc_shutdown (acc_device_nvidia);
-
-  return 0;
-}
-
-/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
-/* { dg-output "unknown async \[0-9\]+" } */
-/* { dg-shouldfail "" } */
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
index 0726ee420c3f..0efcf0d52229 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
@@ -103,6 +103,8 @@ main (int argc, char **argv)
     }
 
   acc_wait (0);
+  /* Test unseen async-argument.  */
+  acc_wait (1);
 
   atime = stop_timer (0);
 
@@ -115,6 +117,8 @@ main (int argc, char **argv)
   start_timer (0);
 
   acc_wait (0);
+  /* Test unseen async-argument.  */
+  acc_wait (1);
 
   atime = stop_timer (0);
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
deleted file mode 100644
index f4f196def3b7..000000000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
-/* { dg-additional-options "-lcuda" } */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <openacc.h>
-#include <cuda.h>
-#include "timer.h"
-
-int
-main (int argc, char **argv)
-{
-  CUdevice dev;
-  CUfunction delay;
-  CUmodule module;
-  CUresult r;
-  CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
-
-  acc_init (acc_device_nvidia);
-
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleLoad (&module, "subr.ptx");
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleGetFunction (&delay, module, "delay");
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
-      abort ();
-    }
-
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
-  if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-	  abort ();
-	}
-
-  acc_set_cuda_stream (0, stream);
-
-  init_timers (1);
-
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
-      abort ();
-    }
-
-  fprintf (stderr, "CheCKpOInT\n");
-  acc_wait (1);
-
-  atime = stop_timer (0);
-
-  if (atime < dtime)
-    {
-      fprintf (stderr, "actual time < delay time\n");
-      abort ();
-    }
-
-  start_timer (0);
-
-  acc_wait (1);
-
-  atime = stop_timer (0);
-
-  if (0.010 < atime)
-    {
-      fprintf (stderr, "actual time < delay time\n");
-      abort ();
-    }
-
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
-  acc_shutdown (acc_device_nvidia);
-
-  return 0;
-}
-
-/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
-/* { dg-output "unknown async \[0-9\]+" } */
-/* { dg-shouldfail "" } */
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
index ef3df13ebc91..b2e2687e4e76 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
@@ -122,6 +122,13 @@ main (int argc, char **argv)
 	}
     }
 
+  if (acc_async_test (0) != 0)
+    abort ();
+
+  /* Test unseen async-argument.  */
+  if (acc_async_test (1) != 1)
+    abort ();
+
   acc_wait_async (0, 1);
 
   if (acc_async_test (0) != 0)
@@ -130,6 +137,23 @@ main (int argc, char **argv)
   if (acc_async_test (1) != 0)
     abort ();
 
+  /* Test unseen async-argument.  */
+  {
+    if (acc_async_test (2) != 1)
+      abort ();
+
+    acc_wait_async (2, 1);
+
+    if (acc_async_test (0) != 0)
+      abort ();
+
+    if (acc_async_test (1) != 0)
+      abort ();
+
+    if (acc_async_test (2) != 1)
+      abort ();
+  }
+
   acc_wait (1);
 
   atime = stop_timer (0);
diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
index 6912f67d444e..4cf62f2728ad 100644
--- libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
+++ libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
@@ -17,9 +17,14 @@ program main
 
   call acc_wait_async (0, 1)
 
+  ! Test unseen async-argument.
+  if (acc_async_test (2) .neqv. .TRUE.) call abort
+  call acc_wait_async (2, 1)
+
   call acc_wait (1)
 
   if (acc_async_test (0) .neqv. .TRUE.) call abort
   if (acc_async_test (1) .neqv. .TRUE.) call abort
+  if (acc_async_test (2) .neqv. .TRUE.) call abort
 
 end program


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Too strict synchronization with the local (host) thread?  (was: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes)
  2018-09-25 13:12 [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes Chung-Lin Tang
  2018-12-07 15:31 ` Thomas Schwinge
  2018-12-07 15:39 ` [PR88407] [OpenACC] Correctly handle unseen async-arguments (was: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes) Thomas Schwinge
@ 2018-12-07 15:57 ` Thomas Schwinge
  2018-12-11 13:30   ` Too strict synchronization with the local (host) thread? Chung-Lin Tang
  2 siblings, 1 reply; 8+ messages in thread
From: Thomas Schwinge @ 2018-12-07 15:57 UTC (permalink / raw)
  To: Chung-Lin Tang; +Cc: gcc-patches

Hi Chung-Lin!

On Tue, 25 Sep 2018 21:11:42 +0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote:
> These are the testsuite/libgomp.oacc-c-c++-common/* changes.

> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
> @@ -114,6 +114,7 @@ main (int argc, char **argv)
>  
>    for (i = 0; i < N; i++)
>      {
> +      stream = (CUstream) acc_get_cuda_stream (i & 1);
>        r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);

What's the motivation for this change?

And then:

> @@ -122,11 +123,11 @@ main (int argc, char **argv)
>  	}
>      }
>  
> -  acc_wait_async (0, 1);
> -
>    if (acc_async_test (0) != 0)
>      abort ();
>  
> +  acc_wait_async (0, 1);
> +
>    if (acc_async_test (1) != 0)
>      abort ();

I somehow feel that this change...

> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
> @@ -133,7 +133,7 @@ main (int argc, char **argv)
>  
>    for (i = 0; i <= N; i++)
>      {
> -      if (acc_async_test (i) != 0)
> +      if (acc_async_test (i) == 0)
>  	abort ();
>      }

..., and this change are needed because we're now more strictly
synchronizing with the local (host) thread.

Regarding the case of "libgomp.oacc-c-c++-common/lib-81.c", as currently
present:

    [...]
      for (i = 0; i < N; i++)
        {
          r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
          if (r != CUDA_SUCCESS)
            {
              fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
              abort ();
            }
        }

This launches N kernels on N separate async queues/CUDA streams, [0..N).

      acc_wait_all_async (N);

Then, the "acc_wait_all_async (N)" -- in my understanding! -- should
*not* synchronize with the local (host) thread, but instead just set up
the additional async queue/CUDA stream N to "depend" on [0..N).

      for (i = 0; i <= N; i++)
        {
          if (acc_async_test (i) != 0)
            abort ();
        }

Thus, all [0..N) should then still be "acc_async_test (i) != 0" (still
running).

      acc_wait (N);

Here, the "acc_wait (N)" would synchronize the local (host) thread with
async queue/CUDA stream N and thus recursively with [0..N).

      for (i = 0; i <= N; i++)
        {
          if (acc_async_test (i) != 1)
            abort ();
        }
    [...]

So, then all these async queues/CUDA streams here indeed are
"acc_async_test (i) != 1", that is, idle.


Now, the more strict synchronization with the local (host) thread is not
wrong in terms of correctness, but I suppose it will impact performance of
otherwise asynchronous operations, which now get synchronized too much?

Or, of course, I'm misunderstanding something...

(For avoidance of doubt, I would accept the "async re-work" as is, but we
should eventually clarify this, and restore the behavior we -- apparently
-- had before, where we didn't synchronize so much?  (So, technically,
the "async re-work" would constitute a regression for this kind of
usage?))


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Too strict synchronization with the local (host) thread?
  2018-12-07 15:57 ` Too strict synchronization with the local (host) thread? (was: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes) Thomas Schwinge
@ 2018-12-11 13:30   ` Chung-Lin Tang
  2018-12-14 14:50     ` Thomas Schwinge
  0 siblings, 1 reply; 8+ messages in thread
From: Chung-Lin Tang @ 2018-12-11 13:30 UTC (permalink / raw)
  To: Thomas Schwinge, Chung-Lin Tang; +Cc: gcc-patches

On 2018/12/7 11:56 PM, Thomas Schwinge wrote:
>> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
>> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
>> @@ -114,6 +114,7 @@ main (int argc, char **argv)
>>   
>>     for (i = 0; i < N; i++)
>>       {
>> +      stream = (CUstream) acc_get_cuda_stream (i & 1);
>>         r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
> What's the motivation for this change?

To place work on both streams 0 and 1.

> ..., and this change are needed because we're now more strictly
> synchronizing with the local (host) thread.
> 
> Regarding the case of "libgomp.oacc-c-c++-common/lib-81.c", as currently
> present:
> 
>      [...]
>        for (i = 0; i < N; i++)
>          {
>            r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
>            if (r != CUDA_SUCCESS)
>              {
>                fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
>                abort ();
>              }
>          }
> 
> This launches N kernels on N separate async queues/CUDA streams, [0..N).
> 
>        acc_wait_all_async (N);
> 
> Then, the "acc_wait_all_async (N)" -- in my understanding! -- should
> *not*  synchronize with the local (host) thread, but instead just set up
> the additional async queue/CUDA stream N to "depend" on [0..N).
> 
>        for (i = 0; i <= N; i++)
>          {
>            if (acc_async_test (i) != 0)
>              abort ();
>          }
> 
> Thus, all [0..N) should then still be "acc_async_test (i) != 0" (still
> running).
> 
>        acc_wait (N);
> 
> Here, the "acc_wait (N)" would synchronize the local (host) thread with
> async queue/CUDA stream N and thus recursively with [0..N).
> 
>        for (i = 0; i <= N; i++)
>          {
>            if (acc_async_test (i) != 1)
>              abort ();
>          }
>      [...]
> 
> So, then all these async queues/CUDA streams here indeed are
> "acc_async_test (i) != 1", that is, idle.
> 
> 
> Now, the more strict synchronization with the local (host) thread is not
> wrong in terms of correctness, but I suppose it will impact performance of
> otherwise asynchronous operations, which now get synchronized too much?
> 
> Or, of course, I'm misunderstanding something...

IIRC, we encountered many issues where people misunderstood the meaning of "wait+async",
using it as if the local host sync happened, where in our original implementation it does not.

Also some areas of the OpenACC spec were vague on whether the local host synchronization should
or should not happen; basically, the wording treated it as if it were only an implementation detail
and didn't matter, and didn't acknowledge that this would be something visible to the user.

At the end, IIRC, I decided that adding a local host synchronization is easier for all of us,
and took the opportunity of the re-org to make this change.

That said, I didn't notice those tests you listed above were meant to test such delicate behavior.

> (For avoidance of doubt, I would accept the "async re-work" as is, but we
> should eventually clarify this, and restore the behavior we -- apparently
> -- had before, where we didn't synchronize so much?  (So, technically,
> the "async re-work" would constitute a regression for this kind of
> usage?)

It's not hard to restore the old behavior, just a few lines to delete. Although as described
above, this change was deliberate.

This might be another issue to raise with the committee. I think I tried on this exact issue
a long time ago, but never got answers.

Thanks,
Chung-Lin

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Too strict synchronization with the local (host) thread?
  2018-12-11 13:30   ` Too strict synchronization with the local (host) thread? Chung-Lin Tang
@ 2018-12-14 14:50     ` Thomas Schwinge
  0 siblings, 0 replies; 8+ messages in thread
From: Thomas Schwinge @ 2018-12-14 14:50 UTC (permalink / raw)
  To: Chung-Lin Tang; +Cc: gcc-patches

Hi Chung-Lin!

On Tue, 11 Dec 2018 21:30:31 +0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote:
> On 2018/12/7 11:56 PM, Thomas Schwinge wrote:
> >> --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
> >> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
> >> @@ -114,6 +114,7 @@ main (int argc, char **argv)
> >>   
> >>     for (i = 0; i < N; i++)
> >>       {
> >> +      stream = (CUstream) acc_get_cuda_stream (i & 1);
> >>         r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
> > What's the motivation for this change?
> 
> To place work on both streams 0 and 1.

That's describing what it does, not the motivation behind it.  ;-)


> > ..., and this change are needed because we're now more strictly
> > synchronizing with the local (host) thread.
> > 
> > Regarding the case of "libgomp.oacc-c-c++-common/lib-81.c", as currently
> > present:
> > 
> >      [...]
> >        for (i = 0; i < N; i++)
> >          {
> >            r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
> >            if (r != CUDA_SUCCESS)
> >              {
> >                fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
> >                abort ();
> >              }
> >          }
> > 
> > This launches N kernels on N separate async queues/CUDA streams, [0..N).
> > 
> >        acc_wait_all_async (N);
> > 
> > Then, the "acc_wait_all_async (N)" -- in my understanding! -- should
> > *not*  synchronize with the local (host) thread, but instead just set up
> > the additional async queue/CUDA stream N to "depend" on [0..N).
> > 
> >        for (i = 0; i <= N; i++)
> >          {
> >            if (acc_async_test (i) != 0)
> >              abort ();
> >          }
> > 
> > Thus, all [0..N) should then still be "acc_async_test (i) != 0" (still
> > running).
> > 
> >        acc_wait (N);
> > 
> > Here, the "acc_wait (N)" would synchronize the local (host) thread with
> > async queue/CUDA stream N and thus recursively with [0..N).
> > 
> >        for (i = 0; i <= N; i++)
> >          {
> >            if (acc_async_test (i) != 1)
> >              abort ();
> >          }
> >      [...]
> > 
> > So, then all these async queues/CUDA streams here indeed are
> > "acc_async_test (i) != 1", that is, idle.
> > 
> > 
> > Now, the more strict synchronization with the local (host) thread is not
> > wrong in terms of correctness, but I suppose it will impact performance of
> > otherwise asynchronous operations, which now get synchronized too much?
> > 
> > Or, of course, I'm misunderstanding something...
> 
> IIRC, we encountered many issues where people misunderstood the meaning of "wait+async",
> using it as if the local host sync happened, where in our original implementation it does not.

..., and that's the right thing, in my opinion.  (Do you disagree?)

> Also some areas of the OpenACC spec were vague on whether the local host synchronization should
> or should not happen; basically, the wording treated as if it was only an implementation detail
> and didn't matter, and didn't acknowledge that this would be something visible to the user.

I suppose in correct code that correctly uses a different mechanism for
inter-thread synchronization, it shouldn't be visible?  (Well, with the
additional synchronization, it would be visible in terms of performance
degradation.)

For example, OpenACC 2.6, 3.2.11. "acc_wait" explicitly states that "If
two or more threads share the same accelerator, the 'acc_wait' routine
will return only if all matching asynchronous operations initiated by
this thread have completed; there is no guarantee that all matching
asynchronous operations initiated by other threads have completed".

I agree that this could be made more explicit throughout the specification,
and also the reading of OpenACC 2.6, 2.16.1. "async clause" is a bit
confusing regarding multiple host threads, but as I understand, the idea
still is that such wait operations do not synchronize at the host thread
level.  (Let's please assume that, and then work with the OpenACC
technical committee to get that clarified in the documentation.)

> At the end, IIRC, I decided that adding a local host synchronization is easier for all of us,

Well...

> and took the opportunity of the re-org to make this change.

Well...  Again, a re-org/re-work should not make such functional
changes...

> That said, I didn't notice those tests you listed above were meant to test such delicate behavior.
> 
> > (For avoidance of doubt, I would accept the "async re-work" as is, but we
> > should eventually clarify this, and restore the behavior we -- apparently
> > -- had before, where we didn't synchronize so much?  (So, technically,
> > the "async re-work" would constitute a regression for this kind of
> > usage?)
> 
> It's not hard to restore the old behavior, just a few lines to delete. Although as described
> above, this change was deliberate.
> 
> This might be another issue to raise with the committee. I think I tried on this exact issue
> a long time ago, but never got answers.

OK, I'll try to find that, or send me a pointer to it, if you still have it.

I propose you include the following.  Would you please review the "TODO"
comments, and again also especially review the
"libgomp/oacc-parallel.c:goacc_wait" change, and confirm no corresponding
"libgomp/oacc-parallel.c:GOACC_wait" change to be done, because that code
is structured differently.

commit e44cc6dc8f76e50c6f905cd408475589dee7b3b1
Author: Thomas Schwinge <thomas@codesourcery.com>
Date:   Thu Dec 13 17:54:35 2018 +0100

    into async re-work: don't synchronize with the local thread unless actually necessary
---
 libgomp/oacc-async.c    | 8 ++++++--
 libgomp/oacc-parallel.c | 1 -
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git libgomp/oacc-async.c libgomp/oacc-async.c
index a38e42781aa0..ec5cbc408d4e 100644
--- libgomp/oacc-async.c
+++ libgomp/oacc-async.c
@@ -195,9 +195,11 @@ acc_wait_async (int async1, int async2)
   if (aq1 == aq2)
     gomp_fatal ("identical parameters");
 
-  thr->dev->openacc.async.synchronize_func (aq1);
   if (aq2)
     thr->dev->openacc.async.serialize_func (aq1, aq2);
+  else
+    //TODO Local thread synchronization.  Necessary for the "async2 == acc_async_sync" case, or can just skip?
+    thr->dev->openacc.async.synchronize_func (aq1);
 }
 
 void
@@ -232,9 +234,11 @@ acc_wait_all_async (int async)
   gomp_mutex_lock (&thr->dev->openacc.async.lock);
   for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
     {
-      thr->dev->openacc.async.synchronize_func (l->aq);
       if (waiting_queue)
 	thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
+      else
+	//TODO Local thread synchronization.  Necessary for the "async == acc_async_sync" case, or can just skip?
+	thr->dev->openacc.async.synchronize_func (l->aq);
     }
   gomp_mutex_unlock (&thr->dev->openacc.async.lock);
 }
diff --git libgomp/oacc-parallel.c libgomp/oacc-parallel.c
index 9519abeccc2c..5a441c9efe38 100644
--- libgomp/oacc-parallel.c
+++ libgomp/oacc-parallel.c
@@ -508,7 +508,6 @@ goacc_wait (int async, int num_waits, va_list *ap)
       else
 	{
 	  goacc_aq aq2 = get_goacc_asyncqueue (async);
-	  acc_dev->openacc.async.synchronize_func (aq);
 	  acc_dev->openacc.async.serialize_func (aq, aq2);
 	}
     }


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes
  2018-12-07 15:31 ` Thomas Schwinge
@ 2018-12-14 21:09   ` Thomas Schwinge
  0 siblings, 0 replies; 8+ messages in thread
From: Thomas Schwinge @ 2018-12-14 21:09 UTC (permalink / raw)
  To: gcc-patches; +Cc: Chung-Lin Tang

Hi!

On Fri, 7 Dec 2018 16:30:53 +0100, I wrote:
> On Tue, 25 Sep 2018 21:11:42 +0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote:
> > These are the testsuite/libgomp.oacc-c-c++-common/* changes.
> 
> Please commit the following three hunks to trunk: the code as present
> doesn't declare its async/wait dependencies correctly.

As I had this queued as a prerequisite for other changes, in r267148 I
have now committed the following to trunk:

commit fef25f06de8e800d2a6ac04b12b6399923d414a9
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date:   Fri Dec 14 20:42:18 2018 +0000

    Correctly describe OpenACC async/wait dependencies
    
            libgomp/
            * testsuite/libgomp.oacc-c-c++-common/data-2-lib.c: Adjust.
            * testsuite/libgomp.oacc-c-c++-common/data-2.c: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/data-3.c: Likewise.
    
    Reviewed-by: Thomas Schwinge <thomas@codesourcery.com>
    
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@267148 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog                                        | 6 ++++++
 libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c | 2 +-
 libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c     | 2 +-
 libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c     | 2 +-
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git libgomp/ChangeLog libgomp/ChangeLog
index b4ab6b690553..b6cbb34908a2 100644
--- libgomp/ChangeLog
+++ libgomp/ChangeLog
@@ -1,3 +1,9 @@
+2018-12-14  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/data-2-lib.c: Adjust.
+	* testsuite/libgomp.oacc-c-c++-common/data-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/data-3.c: Likewise.
+
 2018-12-14  Thomas Schwinge  <thomas@codesourcery.com>
 
 	PR libgomp/88370
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
index 2ddfa7d4a01b..f553d3d839c5 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
@@ -153,7 +153,7 @@ main (int argc, char **argv)
     d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
 
 #pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N], N) \
-  async (4)
+  wait (1, 2, 3) async (4)
   for (int ii = 0; ii < N; ii++)
     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
index 0c6abe69dc17..81d623afa0ea 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
@@ -162,7 +162,7 @@ main (int argc, char **argv)
     d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
 
 #pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) \
-  wait (1) async (4)
+  wait (1, 2, 3) async (4)
   for (int ii = 0; ii < N; ii++)
     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
index 0bf706a1b5d4..5ec50b808a73 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
@@ -138,7 +138,7 @@ main (int argc, char **argv)
     d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
 
 #pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) \
-  wait (1,5) async (4)
+  wait (1, 2, 3, 5) async (4)
   for (int ii = 0; ii < N; ii++)
     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
 


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PR88407] [OpenACC] Correctly handle unseen async-arguments
  2018-12-07 15:39 ` [PR88407] [OpenACC] Correctly handle unseen async-arguments (was: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes) Thomas Schwinge
@ 2018-12-14 21:13   ` Thomas Schwinge
  0 siblings, 0 replies; 8+ messages in thread
From: Thomas Schwinge @ 2018-12-14 21:13 UTC (permalink / raw)
  To: gcc-patches; +Cc: Chung-Lin Tang

Hi!

On Fri, 7 Dec 2018 16:38:58 +0100, I wrote:
> So, confused about the intended behavior, I've asked the OpenACC
> committee to clarify, and filed <https://gcc.gnu.org/PR88407> "[OpenACC]
> Correctly handle unseen async-arguments".
> 
> Assuming this gets clarified in the way I think it should, I suggest the
> following.  Any comments?

Have not yet heard back, but given that the PGI compiler also seems to
handle it this way, I committed the following to trunk in r267150:

commit e7acb9ffce94d592054ecba2eb1970eaf5cbc313
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date:   Fri Dec 14 20:42:40 2018 +0000

    [PR88407] [OpenACC] Correctly handle unseen async-arguments
    
    ... which turn the operation into a no-op.
    
            libgomp/
            PR libgomp/88407
            * plugin/plugin-nvptx.c (nvptx_async_test, nvptx_wait)
            (nvptx_wait_async): Unseen async-argument is a no-op.
            * testsuite/libgomp.oacc-c-c++-common/async_queue-1.c: Update.
            * testsuite/libgomp.oacc-c-c++-common/data-2-lib.c: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/data-2.c: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise.
            * testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/lib-71.c: Merge into...
            * testsuite/libgomp.oacc-c-c++-common/lib-69.c: ... this.  Update.
            * testsuite/libgomp.oacc-c-c++-common/lib-77.c: Merge into...
            * testsuite/libgomp.oacc-c-c++-common/lib-74.c: ... this.  Update
    
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@267150 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog                                  |  13 ++
 libgomp/plugin/plugin-nvptx.c                      |  13 +-
 .../libgomp.oacc-c-c++-common/async_queue-1.c      |  30 +++++
 .../libgomp.oacc-c-c++-common/data-2-lib.c         |   2 +
 .../testsuite/libgomp.oacc-c-c++-common/data-2.c   |   2 +
 .../testsuite/libgomp.oacc-c-c++-common/lib-69.c   |   7 ++
 .../testsuite/libgomp.oacc-c-c++-common/lib-71.c   | 122 ------------------
 .../testsuite/libgomp.oacc-c-c++-common/lib-74.c   |   4 +
 .../testsuite/libgomp.oacc-c-c++-common/lib-77.c   | 138 ---------------------
 .../testsuite/libgomp.oacc-c-c++-common/lib-79.c   |  24 ++++
 libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90  |   5 +
 11 files changed, 93 insertions(+), 267 deletions(-)

diff --git libgomp/ChangeLog libgomp/ChangeLog
index d84c3f4bfe2e..c1f98d76e013 100644
--- libgomp/ChangeLog
+++ libgomp/ChangeLog
@@ -1,5 +1,18 @@
 2018-12-14  Thomas Schwinge  <thomas@codesourcery.com>
 
+	PR libgomp/88407
+	* plugin/plugin-nvptx.c (nvptx_async_test, nvptx_wait)
+	(nvptx_wait_async): Unseen async-argument is a no-op.
+	* testsuite/libgomp.oacc-c-c++-common/async_queue-1.c: Update.
+	* testsuite/libgomp.oacc-c-c++-common/data-2-lib.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/data-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise.
+	* testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-71.c: Merge into...
+	* testsuite/libgomp.oacc-c-c++-common/lib-69.c: ... this.  Update.
+	* testsuite/libgomp.oacc-c-c++-common/lib-77.c: Merge into...
+	* testsuite/libgomp.oacc-c-c++-common/lib-74.c: ... this.  Update.
+
 	* testsuite/libgomp.oacc-c-c++-common/data-2-lib.c: Revise.
 	* testsuite/libgomp.oacc-c-c++-common/data-2.c: Likewise.
 
diff --git libgomp/plugin/plugin-nvptx.c libgomp/plugin/plugin-nvptx.c
index 7d0d38e0c2e1..6f9b16634b10 100644
--- libgomp/plugin/plugin-nvptx.c
+++ libgomp/plugin/plugin-nvptx.c
@@ -1539,9 +1539,8 @@ nvptx_async_test (int async)
   struct ptx_stream *s;
 
   s = select_stream_for_async (async, pthread_self (), false, NULL);
-
   if (!s)
-    GOMP_PLUGIN_fatal ("unknown async %d", async);
+    return 1;
 
   r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
   if (r == CUDA_SUCCESS)
@@ -1596,7 +1595,7 @@ nvptx_wait (int async)
 
   s = select_stream_for_async (async, pthread_self (), false, NULL);
   if (!s)
-    GOMP_PLUGIN_fatal ("unknown async %d", async);
+    return;
 
   CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
 
@@ -1610,14 +1609,14 @@ nvptx_wait_async (int async1, int async2)
   struct ptx_stream *s1, *s2;
   pthread_t self = pthread_self ();
 
+  s1 = select_stream_for_async (async1, self, false, NULL);
+  if (!s1)
+    return;
+
   /* The stream that is waiting (rather than being waited for) doesn't
      necessarily have to exist already.  */
   s2 = select_stream_for_async (async2, self, true, NULL);
 
-  s1 = select_stream_for_async (async1, self, false, NULL);
-  if (!s1)
-    GOMP_PLUGIN_fatal ("invalid async 1\n");
-
   if (s1 == s2)
     GOMP_PLUGIN_fatal ("identical parameters");
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
index 48e1846a36e3..544b19fe6635 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/async_queue-1.c
@@ -41,6 +41,36 @@ int main(void)
       assert (queues[i].cuda_stream == NULL);
     }
 
+  /* No-ops still don't initialize them.  */
+  {
+    size_t i = 0;
+    /* Find the first non-special async-argument.  */
+    while (queues[i].async < 0)
+      ++i;
+    assert (i < queues_n);
+
+#pragma acc wait(queues[i].async) // no-op
+
+    ++i;
+    assert (i < queues_n);
+#pragma acc parallel wait(queues[i].async) // no-op
+    ;
+
+    ++i;
+    assert (i < queues_n);
+    acc_wait(queues[i].async); // no-op
+
+    i += 2;
+    assert (i < queues_n);
+    acc_wait_async(queues[i - 1].async, queues[i].async); // no-op, and async queue "i" does not get set up
+
+    for (size_t i = 0; i < queues_n; ++i)
+      {
+	queues[i].cuda_stream = acc_get_cuda_stream (queues[i].async);
+	assert (queues[i].cuda_stream == NULL);
+      }
+  }
+
   for (size_t i = 0; i < queues_n; ++i)
     {
       /* Use the queue to initialize it.  */
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
index e432f8d9c796..e9d1edaba7f6 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
@@ -69,6 +69,8 @@ main (int argc, char **argv)
   acc_memcpy_from_device_async (b, d_b, nbytes, 1);
 
   acc_wait (1);
+  /* Test unseen async-argument.  */
+  acc_wait (10);
 
   for (i = 0; i < N; i++)
     {
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
index c0f36d3be6ba..2fc4a598e8f6 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
@@ -65,6 +65,8 @@ main (int argc, char **argv)
 #pragma acc update self (b[0:N]) async (1)
 
 #pragma acc wait (1)
+  /* Test unseen async-argument.  */
+#pragma acc wait (10)
 
   for (i = 0; i < N; i++)
     {
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
index 5462f1253522..c10bebaab136 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
@@ -103,6 +103,13 @@ main (int argc, char **argv)
       abort ();
     }
 
+  /* Test unseen async-argument.  */
+  if (acc_async_test (1) != 1)
+    {
+      fprintf (stderr, "acc_async_test failed on unseen async-argument\n");
+      abort ();
+    }
+
   sleep (1);
 
   if (acc_async_test (0) != 1)
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
deleted file mode 100644
index c85e82459554..000000000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
-/* { dg-additional-options "-lcuda" } */
-
-#include <stdio.h>
-#include <unistd.h>
-#include <openacc.h>
-#include <cuda.h>
-
-int
-main (int argc, char **argv)
-{
-  CUdevice dev;
-  CUfunction delay;
-  CUmodule module;
-  CUresult r;
-  CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
-
-  acc_init (acc_device_nvidia);
-
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleLoad (&module, "subr.ptx");
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleGetFunction (&delay, module, "delay");
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
-      abort ();
-    }
-
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
-  if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-	  abort ();
-	}
-
-  acc_set_cuda_stream (0, stream);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
-      abort ();
-    }
-
-  fprintf (stderr, "CheCKpOInT\n");
-  if (acc_async_test (1) != 0)
-    {
-      fprintf (stderr, "asynchronous operation not running\n");
-      abort ();
-    }
-
-  sleep ((int) (dtime / 1000.0f) + 1);
-
-  if (acc_async_test (1) != 1)
-    {
-      fprintf (stderr, "found asynchronous operation still running\n");
-      abort ();
-    }
-
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
-  acc_shutdown (acc_device_nvidia);
-
-  return 0;
-}
-
-/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
-/* { dg-output "unknown async \[0-9\]+" } */
-/* { dg-shouldfail "" } */
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
index 0726ee420c3f..0efcf0d52229 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
@@ -103,6 +103,8 @@ main (int argc, char **argv)
     }
 
   acc_wait (0);
+  /* Test unseen async-argument.  */
+  acc_wait (1);
 
   atime = stop_timer (0);
 
@@ -115,6 +117,8 @@ main (int argc, char **argv)
   start_timer (0);
 
   acc_wait (0);
+  /* Test unseen async-argument.  */
+  acc_wait (1);
 
   atime = stop_timer (0);
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
deleted file mode 100644
index f4f196def3b7..000000000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/* { dg-do run { target openacc_nvidia_accel_selected } } */
-/* { dg-additional-options "-lcuda" } */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <openacc.h>
-#include <cuda.h>
-#include "timer.h"
-
-int
-main (int argc, char **argv)
-{
-  CUdevice dev;
-  CUfunction delay;
-  CUmodule module;
-  CUresult r;
-  CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
-
-  acc_init (acc_device_nvidia);
-
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleLoad (&module, "subr.ptx");
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleGetFunction (&delay, module, "delay");
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
-      abort ();
-    }
-
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
-  if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-	  abort ();
-	}
-
-  acc_set_cuda_stream (0, stream);
-
-  init_timers (1);
-
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
-      abort ();
-    }
-
-  fprintf (stderr, "CheCKpOInT\n");
-  acc_wait (1);
-
-  atime = stop_timer (0);
-
-  if (atime < dtime)
-    {
-      fprintf (stderr, "actual time < delay time\n");
-      abort ();
-    }
-
-  start_timer (0);
-
-  acc_wait (1);
-
-  atime = stop_timer (0);
-
-  if (0.010 < atime)
-    {
-      fprintf (stderr, "actual time < delay time\n");
-      abort ();
-    }
-
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
-  acc_shutdown (acc_device_nvidia);
-
-  return 0;
-}
-
-/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
-/* { dg-output "unknown async \[0-9\]+" } */
-/* { dg-shouldfail "" } */
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
index ef3df13ebc91..b2e2687e4e76 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
@@ -122,6 +122,13 @@ main (int argc, char **argv)
 	}
     }
 
+  if (acc_async_test (0) != 0)
+    abort ();
+
+  /* Test unseen async-argument.  */
+  if (acc_async_test (1) != 1)
+    abort ();
+
   acc_wait_async (0, 1);
 
   if (acc_async_test (0) != 0)
@@ -130,6 +137,23 @@ main (int argc, char **argv)
   if (acc_async_test (1) != 0)
     abort ();
 
+  /* Test unseen async-argument.  */
+  {
+    if (acc_async_test (2) != 1)
+      abort ();
+
+    acc_wait_async (2, 1);
+
+    if (acc_async_test (0) != 0)
+      abort ();
+
+    if (acc_async_test (1) != 0)
+      abort ();
+
+    if (acc_async_test (2) != 1)
+      abort ();
+  }
+
   acc_wait (1);
 
   atime = stop_timer (0);
diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
index 6912f67d444e..4cf62f2728ad 100644
--- libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
+++ libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
@@ -17,9 +17,14 @@ program main
 
   call acc_wait_async (0, 1)
 
+  ! Test unseen async-argument.
+  if (acc_async_test (2) .neqv. .TRUE.) call abort
+  call acc_wait_async (2, 1)
+
   call acc_wait (1)
 
   if (acc_async_test (0) .neqv. .TRUE.) call abort
   if (acc_async_test (1) .neqv. .TRUE.) call abort
+  if (acc_async_test (2) .neqv. .TRUE.) call abort
 
 end program


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-12-14 21:13 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-25 13:12 [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes Chung-Lin Tang
2018-12-07 15:31 ` Thomas Schwinge
2018-12-14 21:09   ` Thomas Schwinge
2018-12-07 15:39 ` [PR88407] [OpenACC] Correctly handle unseen async-arguments (was: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes) Thomas Schwinge
2018-12-14 21:13   ` [PR88407] [OpenACC] Correctly handle unseen async-arguments Thomas Schwinge
2018-12-07 15:57 ` Too strict synchronization with the local (host) thread? (was: [PATCH 5/6, OpenACC, libgomp] Async re-work, C/C++ testsuite changes) Thomas Schwinge
2018-12-11 13:30   ` Too strict synchronization with the local (host) thread? Chung-Lin Tang
2018-12-14 14:50     ` Thomas Schwinge

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).