public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 2/3] [og9] Use temporary buffers for async host2dev copies
  2019-08-13 21:37 [PATCH 0/3] [og9] OpenACC async fixes for AMD GCN Julian Brown
@ 2019-08-13 21:37 ` Julian Brown
  2019-08-13 21:43 ` [PATCH 3/3] [og9] Wait on queue-full condition in AMD GCN libgomp offloading plugin Julian Brown
  2019-08-13 22:46 ` [PATCH 1/3] [og9] Wait at end of OpenACC asynchronous kernels regions Julian Brown
  2 siblings, 0 replies; 5+ messages in thread
From: Julian Brown @ 2019-08-13 21:37 UTC (permalink / raw)
  To: gcc-patches; +Cc: Andrew Stubbs, Jakub Jelinek

In libgomp, host-to-device transfers are instigated in several places
where the source data is either on the stack, or in an unstable
heap location (i.e. which is immediately freed after performing the
host-to-device transfer).

When the transfer is asynchronous, this means that taking the address
of source data and attempting the copy from that at some later point
is extremely likely to fail. A previous fix for this problem (from our
internal branch, and included with the AMD GCN offloading patches)
attempted to separate transfers from the stack (performing them
immediately) from transfers from the heap (which can safely be done some
time later).

Unfortunately that doesn't work well with more recent changes to libgomp
and the GCN plugin. So instead, this patch copies the source data for
asynchronous host-to-device copies immediately to a temporary buffer,
then the transfer to the device can safely take place asynchronously
some time later.

Julian

ChangeLog

	libgomp/
	* plugin/plugin-gcn.c (struct copy_data): Add using_src_copy field.
	(copy_data): Free temporary buffer if using.
	(queue_push_copy): Add using_src_copy parameter.
	(GOMP_OFFLOAD_dev2dev, GOMP_OFFLOAD_async_dev2host): Update calls to
	queue_push_copy.
	(GOMP_OFFLOAD_async_host2dev): Likewise.  Allocate temporary buffer and
	copy source data to it immediately.
	* target.c (gomp_copy_host2dev): Update function comment.
	(copy_host2dev_immediate): Remove.
	(gomp_map_pointer, gomp_map_vars_internal): Replace calls to
	copy_host2dev_immediate with calls to gomp_copy_host2dev.
---
 libgomp/ChangeLog.openacc   | 14 ++++++++++
 libgomp/plugin/plugin-gcn.c | 20 ++++++++++---
 libgomp/target.c            | 56 +++++++++++++++----------------------
 3 files changed, 52 insertions(+), 38 deletions(-)

diff --git a/libgomp/ChangeLog.openacc b/libgomp/ChangeLog.openacc
index 2279545f361..2a9a7f18ca2 100644
--- a/libgomp/ChangeLog.openacc
+++ b/libgomp/ChangeLog.openacc
@@ -1,3 +1,17 @@
+2019-08-13  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (struct copy_data): Add using_src_copy field.
+	(copy_data): Free temporary buffer if using.
+	(queue_push_copy): Add using_src_copy parameter.
+	(GOMP_OFFLOAD_dev2dev, GOMP_OFFLOAD_async_dev2host): Update calls to
+	queue_push_copy.
+	(GOMP_OFFLOAD_async_host2dev): Likewise.  Allocate temporary buffer and
+	copy source data to it immediately.
+	* target.c (gomp_copy_host2dev): Update function comment.
+	(copy_host2dev_immediate): Remove.
+	(gomp_map_pointer, gomp_map_vars_internal): Replace calls to
+	copy_host2dev_immediate with calls to gomp_copy_host2dev.
+
 2019-08-08  Julian Brown  <julian@codesourcery.com>
 
 	* plugin/plugin-gcn.c (gcn_exec): Use 1 for the default number of
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index a41568b3306..65690e643ed 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -3063,6 +3063,7 @@ struct copy_data
   const void *src;
   size_t len;
   bool use_hsa_memory_copy;
+  bool using_src_copy;
   struct goacc_asyncqueue *aq;
 };
 
@@ -3077,12 +3078,14 @@ copy_data (void *data_)
     hsa_fns.hsa_memory_copy_fn (data->dst, data->src, data->len);
   else
     memcpy (data->dst, data->src, data->len);
+  if (data->using_src_copy)
+    free ((void *) data->src);
   free (data);
 }
 
 static void
 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
-		 size_t len, bool use_hsa_memory_copy)
+		 size_t len, bool use_hsa_memory_copy, bool using_src_copy)
 {
   if (DEBUG_QUEUES)
     HSA_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
@@ -3093,6 +3096,7 @@ queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
   data->src = src;
   data->len = len;
   data->use_hsa_memory_copy = use_hsa_memory_copy;
+  data->using_src_copy = using_src_copy;
   data->aq = aq;
   queue_push_callback (aq, copy_data, data);
 }
@@ -3137,7 +3141,7 @@ GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
     {
       struct agent_info *agent = get_agent_info (device);
       maybe_init_omp_async (agent);
-      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
+      queue_push_copy (agent->omp_async_queue, dst, src, n, false, false);
       return true;
     }
 
@@ -3469,7 +3473,15 @@ GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
 {
   struct agent_info *agent = get_agent_info (device);
   assert (agent == aq->agent);
-  queue_push_copy (aq, dst, src, n, image_address_p (agent, dst));
+  /* The source data does not necessarily remain live until the deferred
+     copy happens.  Taking a snapshot of the data here avoids reading
+     uninitialised data later, but means that (a) data is copied twice and
+     (b) modifications to the copied data between the "spawning" point of
+     the asynchronous kernel and when it is executed will not be seen.
+     But, that is probably correct.  */
+  void *src_copy = GOMP_PLUGIN_malloc (n);
+  memcpy (src_copy, src, n);
+  queue_push_copy (aq, dst, src_copy, n, image_address_p (agent, dst), true);
   return true;
 }
 
@@ -3479,7 +3491,7 @@ GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
 {
   struct agent_info *agent = get_agent_info (device);
   assert (agent == aq->agent);
-  queue_push_copy (aq, dst, src, n, image_address_p (agent, src));
+  queue_push_copy (aq, dst, src, n, image_address_p (agent, src), false);
   return true;
 }
 
diff --git a/libgomp/target.c b/libgomp/target.c
index 4645894f869..5f7f946e2ba 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -303,10 +303,9 @@ gomp_to_device_kind_p (int kind)
 }
 
 /* Copy host memory to an offload device.  In asynchronous mode (if AQ is
-   non-NULL), this is only safe when the source memory is a global or heap
-   location (otherwise a copy may take place from a dangling pointer to an
-   expired stack frame).  Use copy_host2dev_immediate for copies from stack
-   locations.  */
+   non-NULL), H may point to a stack location.  It is up to the underlying
+   plugin to ensure that this data is read immediately, rather than at some
+   later point when the stack frame will likely have been destroyed.  */
 
 attribute_hidden void
 gomp_copy_host2dev (struct gomp_device_descr *devicep,
@@ -346,17 +345,6 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
     gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
 }
 
-/* Use this variant for host-to-device copies from stack locations that may not
-   be live at the time an asynchronous copy operation takes place.  */
-
-static void
-copy_host2dev_immediate (struct gomp_device_descr *devicep, void *d,
-			 const void *h, size_t sz,
-			 struct gomp_coalesce_buf *cbuf)
-{
-  gomp_copy_host2dev (devicep, NULL, d, h, sz, cbuf);
-}
-
 attribute_hidden void
 gomp_copy_dev2host (struct gomp_device_descr *devicep,
 		    struct goacc_asyncqueue *aq,
@@ -617,10 +605,10 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
   if (cur_node.host_start == (uintptr_t) NULL)
     {
       cur_node.tgt_offset = (uintptr_t) NULL;
-      copy_host2dev_immediate (devicep,
-			       (void *) (tgt->tgt_start + target_offset),
-			       (void *) &cur_node.tgt_offset,
-			       sizeof (void *), cbuf);
+      gomp_copy_host2dev (devicep, aq,
+			  (void *) (tgt->tgt_start + target_offset),
+			  (void *) &cur_node.tgt_offset, sizeof (void *),
+			  cbuf);
       return;
     }
   /* Add bias to the pointer value.  */
@@ -639,9 +627,8 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
      array section.  Now subtract bias to get what we want
      to initialize the pointer with.  */
   cur_node.tgt_offset -= bias;
-  copy_host2dev_immediate (devicep, (void *) (tgt->tgt_start + target_offset),
-			   (void *) &cur_node.tgt_offset, sizeof (void *),
-			   cbuf);
+  gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
+		      (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
 }
 
 static void
@@ -1460,13 +1447,13 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
 		  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1);
 		if (cur_node.tgt_offset)
 		  cur_node.tgt_offset -= sizes[i];
-		copy_host2dev_immediate (devicep,
-					 (void *) (n->tgt->tgt_start
-						   + n->tgt_offset
-						   + cur_node.host_start
-						   - n->host_start),
-					 (void *) &cur_node.tgt_offset,
-					 sizeof (void *), cbufp);
+		gomp_copy_host2dev (devicep, aq,
+				    (void *) (n->tgt->tgt_start
+					      + n->tgt_offset
+					      + cur_node.host_start
+					      - n->host_start),
+				    (void *) &cur_node.tgt_offset,
+				    sizeof (void *), cbufp);
 		cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
 				      + cur_node.host_start - n->host_start;
 		continue;
@@ -1705,8 +1692,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
 		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
 		    /* We intentionally do not use coalescing here, as it's not
 		       data allocated by the current call to this function.  */
-		    copy_host2dev_immediate (devicep, (void *) n->tgt_offset,
-					     &tgt_addr, sizeof (void *), NULL);
+		    gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
+					&tgt_addr, sizeof (void *), NULL);
 		  }
 		array++;
 	      }
@@ -1828,9 +1815,10 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
       for (i = 0; i < mapnum; i++)
 	{
 	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
-	  copy_host2dev_immediate (devicep,
-	    (void *) (tgt->tgt_start + i * sizeof (void *)),
-	    (void *) &cur_node.tgt_offset, sizeof (void *), cbufp);
+	  gomp_copy_host2dev (devicep, aq,
+			      (void *) (tgt->tgt_start + i * sizeof (void *)),
+			      (void *) &cur_node.tgt_offset, sizeof (void *),
+			      cbufp);
 	}
     }
 
-- 
2.22.0

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 0/3] [og9] OpenACC async fixes for AMD GCN
@ 2019-08-13 21:37 Julian Brown
  2019-08-13 21:37 ` [PATCH 2/3] [og9] Use temporary buffers for async host2dev copies Julian Brown
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Julian Brown @ 2019-08-13 21:37 UTC (permalink / raw)
  To: gcc-patches; +Cc: Andrew Stubbs, Jakub Jelinek

These patches stabilise async support for AMD GCN. Several tests that
previously failed (some intermittently) now work.

Further commentary is provided alongside each patch. Tested with
offloading to AMD GCN.

I will apply shortly to the openacc-gcc-9-branch.

Thanks,

Julian

Julian Brown (3):
  [og9] Wait at end of OpenACC asynchronous kernels regions
  [og9] Use temporary buffers for async host2dev copies
  [og9] Wait on queue-full condition in AMD GCN libgomp offloading
    plugin

 gcc/ChangeLog.openacc       |  7 +++++
 gcc/omp-oacc-kernels.c      | 28 ++++++++++++++-----
 libgomp/ChangeLog.openacc   | 19 +++++++++++++
 libgomp/plugin/plugin-gcn.c | 31 ++++++++++++++++----
 libgomp/target.c            | 56 +++++++++++++++----------------------
 5 files changed, 94 insertions(+), 47 deletions(-)

-- 
2.22.0

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 3/3] [og9] Wait on queue-full condition in AMD GCN libgomp offloading plugin
  2019-08-13 21:37 [PATCH 0/3] [og9] OpenACC async fixes for AMD GCN Julian Brown
  2019-08-13 21:37 ` [PATCH 2/3] [og9] Use temporary buffers for async host2dev copies Julian Brown
@ 2019-08-13 21:43 ` Julian Brown
  2019-08-13 22:46 ` [PATCH 1/3] [og9] Wait at end of OpenACC asynchronous kernels regions Julian Brown
  2 siblings, 0 replies; 5+ messages in thread
From: Julian Brown @ 2019-08-13 21:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: Andrew Stubbs, Jakub Jelinek

This patch lets the AMD GCN libgomp plugin wait for asynchronous queues
to have some space to push new operations when they are full, rather
than just erroring out immediately on that condition. This fixes the
libgomp.oacc-c-c++-common/da-4.c test.

Julian

ChangeLog

	libgomp/
	* plugin/plugin-gcn.c (queue_push_callback): Wait on queue-full
	condition.
---
 libgomp/ChangeLog.openacc   |  5 +++++
 libgomp/plugin/plugin-gcn.c | 11 +++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/libgomp/ChangeLog.openacc b/libgomp/ChangeLog.openacc
index 2a9a7f18ca2..f9d8e6ecd39 100644
--- a/libgomp/ChangeLog.openacc
+++ b/libgomp/ChangeLog.openacc
@@ -1,3 +1,8 @@
+2019-08-13  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (queue_push_callback): Wait on queue-full
+	condition.
+
 2019-08-13  Julian Brown  <julian@codesourcery.com>
 
 	* plugin/plugin-gcn.c (struct copy_data): Add using_src_copy field.
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 65690e643ed..099f70b647c 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -1416,8 +1416,15 @@ queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
 		     void *data)
 {
   if (aq->queue_n == ASYNC_QUEUE_SIZE)
-    GOMP_PLUGIN_fatal ("Async thread %d:%d: error: queue overflowed",
-		       aq->agent->device_id, aq->id);
+    {
+      pthread_mutex_lock (&aq->mutex);
+
+      /* Queue is full.  Wait for it to not be full.  */
+      while (aq->queue_n == ASYNC_QUEUE_SIZE)
+	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+
+      pthread_mutex_unlock (&aq->mutex);
+    }
 
   pthread_mutex_lock (&aq->mutex);
 
-- 
2.22.0

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 1/3] [og9] Wait at end of OpenACC asynchronous kernels regions
  2019-08-13 21:37 [PATCH 0/3] [og9] OpenACC async fixes for AMD GCN Julian Brown
  2019-08-13 21:37 ` [PATCH 2/3] [og9] Use temporary buffers for async host2dev copies Julian Brown
  2019-08-13 21:43 ` [PATCH 3/3] [og9] Wait on queue-full condition in AMD GCN libgomp offloading plugin Julian Brown
@ 2019-08-13 22:46 ` Julian Brown
  2022-01-13 10:07   ` Thomas Schwinge
  2 siblings, 1 reply; 5+ messages in thread
From: Julian Brown @ 2019-08-13 22:46 UTC (permalink / raw)
  To: gcc-patches; +Cc: Andrew Stubbs, Jakub Jelinek

This patch provides a workaround for unreliable operation of asynchronous
kernels regions on AMD GCN. At present, kernels regions are decomposed
into a series of parallel regions surrounded by a data region capturing
the data-movement clauses needed by the region as a whole:

  #pragma acc kernels async(n)
  { ... }

is translated to:

  #pragma acc data copyin(...) copyout(...)
  {
    #pragma acc parallel async(n) present(...)
    { ... }
    #pragma acc parallel async(n) present(...)
    { ... }
  }

This is however problematic for two reasons:

 - Variables mapped by the data clause will be unmapped immediately at the end
   of the data region, regardless of whether the inner asynchronous
   parallels have completed. (This causes crashes for GCN.)

 - Even if the "present" clause caused the reference count to stay above zero
   at the end of the data region -- which it doesn't -- the "present"
   clauses on the inner parallel regions would not cause "copyout"
   variables to be transferred back to the host at the appropriate time,
   i.e. when the async parallel region had completed.

There is no "async" data construct in OpenACC, so the correct solution
(which I am deferring on for now) is probably to use asynchronous
"enter data" and "exit data" directives when translating asynchronous
kernels regions instead.

The attached patch just adds a "wait" operation before the end of
the enclosing data region. This works, but introduces undesirable
synchronisation with the host.

Julian

ChangeLog

	gcc/
	* omp-oacc-kernels.c (add_wait): New function, split out of...
	(add_async_clauses_and_wait): ...here. Call new outlined function.
	(decompose_kernels_region_body): Add wait at the end of
	explicitly-asynchronous kernels regions.
---
 gcc/ChangeLog.openacc  |  7 +++++++
 gcc/omp-oacc-kernels.c | 28 +++++++++++++++++++++-------
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/gcc/ChangeLog.openacc b/gcc/ChangeLog.openacc
index 84d80511603..a22f07c817c 100644
--- a/gcc/ChangeLog.openacc
+++ b/gcc/ChangeLog.openacc
@@ -1,3 +1,10 @@
+2019-08-13  Julian Brown  <julian@codesourcery.com>
+
+	* omp-oacc-kernels.c (add_wait): New function, split out of...
+	(add_async_clauses_and_wait): ...here. Call new outlined function.
+	(decompose_kernels_region_body): Add wait at the end of
+	explicitly-asynchronous kernels regions.
+
 2019-08-08  Julian Brown  <julian@codesourcery.com>
 
 	* config/gcn/gcn.c (gcn_goacc_validate_dims): Ensure
diff --git a/gcc/omp-oacc-kernels.c b/gcc/omp-oacc-kernels.c
index 20913859c12..a6c4220f472 100644
--- a/gcc/omp-oacc-kernels.c
+++ b/gcc/omp-oacc-kernels.c
@@ -900,6 +900,18 @@ maybe_build_inner_data_region (location_t loc, gimple *body,
   return body;
 }
 
+static void
+add_wait (location_t loc, gimple_seq *region_body)
+{
+  /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0).  */
+  tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
+  tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
+  gimple *wait_call = gimple_build_call (wait_fn, 2,
+                                         sync_arg, integer_zero_node);
+  gimple_set_location (wait_call, loc);
+  gimple_seq_add_stmt (region_body, wait_call);
+}
+
 /* Helper function of decompose_kernels_region_body.  The statements in
    REGION_BODY are expected to be decomposed parallel regions; add an
    "async" clause to each.  Also add a "wait" pragma at the end of the
@@ -923,13 +935,7 @@ add_async_clauses_and_wait (location_t loc, gimple_seq *region_body)
       gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt),
                                      target_clauses);
     }
-  /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0).  */
-  tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
-  tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
-  gimple *wait_call = gimple_build_call (wait_fn, 2,
-                                         sync_arg, integer_zero_node);
-  gimple_set_location (wait_call, loc);
-  gimple_seq_add_stmt (region_body, wait_call);
+  add_wait (loc, region_body);
 }
 
 /* Auxiliary analysis of the body of a kernels region, to determine for each
@@ -1378,6 +1384,14 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
      a wait directive at the end.  */
   if (async_clause == NULL)
     add_async_clauses_and_wait (loc, &region_body);
+  else
+    /* !!! If we have asynchronous parallel blocks inside a (synchronous) data
+       region, then target memory will get unmapped at the point the data
+       region ends, even if the inner asynchronous parallels have not yet
+       completed.  For kernels marked "async", we might want to use "enter data
+       async(...)" and "exit data async(...)" instead.
+       For now, insert a (synchronous) wait at the end of the block.  */
+    add_wait (loc, &region_body);
 
   tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
   gimple *body = gimple_build_bind (kernels_locals, region_body,
-- 
2.22.0

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Wait at end of OpenACC asynchronous kernels regions
  2019-08-13 22:46 ` [PATCH 1/3] [og9] Wait at end of OpenACC asynchronous kernels regions Julian Brown
@ 2022-01-13 10:07   ` Thomas Schwinge
  0 siblings, 0 replies; 5+ messages in thread
From: Thomas Schwinge @ 2022-01-13 10:07 UTC (permalink / raw)
  To: Julian Brown, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2457 bytes --]

Hi!

On 2019-08-13T14:37:13-0700, Julian Brown <julian@codesourcery.com> wrote:
> This patch provides a workaround for unreliable operation of asynchronous
> kernels regions on AMD GCN. At present, kernels regions are decomposed
> into a series of parallel regions surrounded by a data region capturing
> the data-movement clauses needed by the region as a whole:
>
>   #pragma acc kernels async(n)
>   { ... }
>
> is translated to:

... simplified...

>   #pragma acc data copyin(...) copyout(...)
>   {
>     #pragma acc parallel async(n) present(...)
>     { ... }
>     #pragma acc parallel async(n) present(...)
>     { ... }
>   }
>
> This is however problematic for two reasons:
>
>  - Variables mapped by the data clause will be unmapped immediately at the end
>    of the data region, regardless of whether the inner asynchronous
>    parallels have completed. (This causes crashes for GCN.)
>
>  - Even if the "present" clause caused the reference count to stay above zero
>    at the end of the data region -- which it doesn't -- the "present"
>    clauses on the inner parallel regions would not cause "copyout"
>    variables to be transferred back to the host at the appropriate time,
>    i.e. when the async parallel region had completed.

> There is no "async" data construct in OpenACC

(Actually, as of OpenACC 3.2 there now is:
<https://gcc.gnu.org/PR97390> "[OpenACC] 'async' clause on 'data' construct"
-- but that's not yet implemented, so doesn't help us here.)

> so the correct solution
> (which I am deferring on for now) is probably to use asynchronous
> "enter data" and "exit data" directives when translating asynchronous
> kernels regions instead.

(Or rather, use structured 'data' (as we're now doing), but with
appropriate 'async' clauses.)

> The attached patch just adds a "wait" operation before the end of
> the enclosing data region. This works, but introduces undesirable
> synchronisation with the host.

ACK, thanks.  Pushed to master branch in
commit e52253bcc0916d9a7c7ba4bbe7501ae1ded3b8a8
"Wait at end of OpenACC asynchronous kernels regions", see attached.


Grüße
 Thomas


-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Wait-at-end-of-OpenACC-asynchronous-kernels-regions.patch --]
[-- Type: text/x-diff, Size: 4764 bytes --]

From e52253bcc0916d9a7c7ba4bbe7501ae1ded3b8a8 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian@codesourcery.com>
Date: Fri, 9 Aug 2019 13:01:33 -0700
Subject: [PATCH] Wait at end of OpenACC asynchronous kernels regions

In OpenACC 'kernels' decomposition, we're improperly nesting synchronous and
asynchronous data and compute regions, giving rise to data races when the
asynchronicity is actually executed, as is visible in at least on test case
with GCN offloading.

The proper fix is to correctly use the asynchronous interfaces, making the
currently synchronous data regions fully asynchronous (see also
<https://gcc.gnu.org/PR97390> "[OpenACC] 'async' clause on 'data' construct",
which is to share the same implementation), but that's for later; for now add
some more synchronization.

	gcc/
	* omp-oacc-kernels-decompose.cc (add_wait): New function, split out
	of...
	(add_async_clauses_and_wait): ...here. Call new outlined function.
	(decompose_kernels_region_body): Add wait at the end of
	explicitly-asynchronous kernels regions.
	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/f-asyncwait-1.c: Remove GCN
	offloading execution XFAIL.

Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com>
---
 gcc/omp-oacc-kernels-decompose.cc             | 31 ++++++++++++++-----
 .../libgomp.oacc-c-c++-common/f-asyncwait-1.c |  1 -
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/gcc/omp-oacc-kernels-decompose.cc b/gcc/omp-oacc-kernels-decompose.cc
index 4ca899d5ece..21872db3ed3 100644
--- a/gcc/omp-oacc-kernels-decompose.cc
+++ b/gcc/omp-oacc-kernels-decompose.cc
@@ -878,6 +878,18 @@ maybe_build_inner_data_region (location_t loc, gimple *body,
   return body;
 }
 
+static void
+add_wait (location_t loc, gimple_seq *region_body)
+{
+  /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0).  */
+  tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
+  tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
+  gimple *wait_call = gimple_build_call (wait_fn, 2,
+					 sync_arg, integer_zero_node);
+  gimple_set_location (wait_call, loc);
+  gimple_seq_add_stmt (region_body, wait_call);
+}
+
 /* Helper function of decompose_kernels_region_body.  The statements in
    REGION_BODY are expected to be decomposed parts; add an 'async' clause to
    each.  Also add a 'wait' directive at the end of the sequence.  */
@@ -900,13 +912,7 @@ add_async_clauses_and_wait (location_t loc, gimple_seq *region_body)
       gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt),
 				     target_clauses);
     }
-  /* A '#pragma acc wait' is just a call 'GOACC_wait (acc_async_sync, 0)'.  */
-  tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
-  tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
-  gimple *wait_call = gimple_build_call (wait_fn, 2,
-					 sync_arg, integer_zero_node);
-  gimple_set_location (wait_call, loc);
-  gimple_seq_add_stmt (region_body, wait_call);
+  add_wait (loc, region_body);
 }
 
 /* Auxiliary analysis of the body of a kernels region, to determine for each
@@ -1352,6 +1358,17 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
      a wait directive at the end.  */
   if (async_clause == NULL)
     add_async_clauses_and_wait (loc, &region_body);
+  else
+    /* !!! If we have asynchronous parallel blocks inside a (synchronous) data
+       region, then target memory will get unmapped at the point the data
+       region ends, even if the inner asynchronous parallels have not yet
+       completed.  For kernels marked "async", we might want to use "enter data
+       async(...)" and "exit data async(...)" instead, or asynchronous data
+       regions (see also <https://gcc.gnu.org/PR97390>
+       "[OpenACC] 'async' clause on 'data' construct",
+       which is to share the same implementation).
+       For now, insert a (synchronous) wait at the end of the block.  */
+    add_wait (loc, &region_body);
 
   tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
   gimple *body = gimple_build_bind (kernels_locals, region_body,
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/f-asyncwait-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/f-asyncwait-1.c
index f7ccecbf4b4..ef7735b2ef4 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/f-asyncwait-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/f-asyncwait-1.c
@@ -3,7 +3,6 @@
 /* Based on '../libgomp.oacc-fortran/asyncwait-1.f90'.  */
 
 /* { dg-additional-options "--param=openacc-kernels=decompose" } */
-/* { dg-xfail-run-if TODO { openacc_radeon_accel_selected } } */
 
 /* { dg-additional-options "-fopt-info-all-omp" }
    { dg-additional-options "-foffload=-fopt-info-all-omp" } */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-01-13 10:07 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-13 21:37 [PATCH 0/3] [og9] OpenACC async fixes for AMD GCN Julian Brown
2019-08-13 21:37 ` [PATCH 2/3] [og9] Use temporary buffers for async host2dev copies Julian Brown
2019-08-13 21:43 ` [PATCH 3/3] [og9] Wait on queue-full condition in AMD GCN libgomp offloading plugin Julian Brown
2019-08-13 22:46 ` [PATCH 1/3] [og9] Wait at end of OpenACC asynchronous kernels regions Julian Brown
2022-01-13 10:07   ` Thomas Schwinge

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).