public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] nvptx: Cache stacks block for OpenMP kernel launch
@ 2020-10-26 14:14 Julian Brown
  2020-10-26 14:26 ` Jakub Jelinek
  2020-10-27 13:17 ` Julian Brown
  0 siblings, 2 replies; 16+ messages in thread
From: Julian Brown @ 2020-10-26 14:14 UTC (permalink / raw)
  To: gcc-patches; +Cc: Tom de Vries, Jakub Jelinek, Thomas Schwinge

Hi,

This patch adds caching for the stack block allocated for offloaded
OpenMP kernel launches on NVPTX. This is a performance optimisation --
we observed an average 11% or so performance improvement with this patch
across a set of accelerated GPU benchmarks on one machine (results vary
according to individual benchmark and with hardware used).

A given kernel launch will reuse the stack block from the previous launch
if it is large enough, else it is freed and reallocated. A slight caveat
is that the memory is not freed until the device is closed, so e.g. code
that uses highly variable launch geometries together with large amounts of
GPU RAM might exhaust device resources slightly sooner with this patch.

Another way this patch gains performance is by omitting the
synchronisation at the end of an OpenMP offload kernel launch -- it is
safe for the GPU and CPU to continue executing in parallel at that point,
because e.g. copies back from the device will still be synchronised
properly with kernel completion anyway.

In turn, the last part necessitates a change to the way "(perhaps abort
was called)" errors are detected and reported.

Tested with offloading to NVPTX. OK for mainline?

Thanks,

Julian

2020-10-26  Julian Brown  <julian@codesourcery.com>

libgomp/
	* plugin/plugin-nvptx.c (maybe_abort_message): Add function.
	(CUDA_CALL_ERET, CUDA_CALL_ASSERT): Use above function.
	(struct ptx_device): Add omp_stacks struct.
	(nvptx_open_device): Initialise cached-stacks housekeeping info.
	(nvptx_close_device): Free cached stacks block and mutex.
	(nvptx_stacks_alloc): Rename to...
	(nvptx_stacks_acquire): This.  Cache stacks block between runs if same
	size or smaller is required.
	(nvptx_stacks_free): Rename to...
	(nvptx_stacks_release): This.  Do not free stacks block, but release
	mutex.
	(GOMP_OFFLOAD_run): Adjust for changes to above functions, and remove
	special-case "abort" error handling and synchronisation after kernel
	launch.
---
 libgomp/plugin/plugin-nvptx.c | 91 ++++++++++++++++++++++++++---------
 1 file changed, 68 insertions(+), 23 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 11d4ceeae62e..e7ff5d5213e0 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -137,6 +137,15 @@ init_cuda_lib (void)
 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
 
+static const char *
+maybe_abort_message (unsigned errmsg)
+{
+  if (errmsg == CUDA_ERROR_LAUNCH_FAILED)
+    return " (perhaps abort was called)";
+  else
+    return "";
+}
+
 /* Convenience macros for the frequently used CUDA library call and
    error handling sequence as well as CUDA library calls that
    do the error checking themselves or don't do it at all.  */
@@ -147,8 +156,9 @@ init_cuda_lib (void)
       = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
     if (__r != CUDA_SUCCESS)			\
       {						\
-	GOMP_PLUGIN_error (#FN " error: %s",	\
-			   cuda_error (__r));	\
+	GOMP_PLUGIN_error (#FN " error: %s%s",	\
+			   cuda_error (__r),	\
+			   maybe_abort_message (__r));	\
 	return ERET;				\
       }						\
   } while (0)
@@ -162,8 +172,9 @@ init_cuda_lib (void)
       = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
     if (__r != CUDA_SUCCESS)			\
       {						\
-	GOMP_PLUGIN_fatal (#FN " error: %s",	\
-			   cuda_error (__r));	\
+	GOMP_PLUGIN_fatal (#FN " error: %s%s",	\
+			   cuda_error (__r),	\
+			   maybe_abort_message (__r));	\
       }						\
   } while (0)
 
@@ -307,6 +318,14 @@ struct ptx_device
   struct ptx_free_block *free_blocks;
   pthread_mutex_t free_blocks_lock;
 
+  /* OpenMP stacks, cached between kernel invocations.  */
+  struct
+    {
+      CUdeviceptr ptr;
+      size_t size;
+      pthread_mutex_t lock;
+    } omp_stacks;
+
   struct ptx_device *next;
 };
 
@@ -514,6 +533,10 @@ nvptx_open_device (int n)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
 
+  ptx_dev->omp_stacks.ptr = 0;
+  ptx_dev->omp_stacks.size = 0;
+  pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
+
   return ptx_dev;
 }
 
@@ -534,6 +557,11 @@ nvptx_close_device (struct ptx_device *ptx_dev)
   pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
   pthread_mutex_destroy (&ptx_dev->image_lock);
 
+  pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
+
+  if (ptx_dev->omp_stacks.ptr)
+    CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
+
   if (!ptx_dev->ctx_shared)
     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 
@@ -1866,26 +1894,49 @@ nvptx_stacks_size ()
   return 128 * 1024;
 }
 
-/* Return contiguous storage for NUM stacks, each SIZE bytes.  */
+/* Return contiguous storage for NUM stacks, each SIZE bytes, and obtain the
+   lock for that storage.  */
 
 static void *
-nvptx_stacks_alloc (size_t size, int num)
+nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
 {
-  CUdeviceptr stacks;
-  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
+  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+
+  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
+    return (void *) ptx_dev->omp_stacks.ptr;
+
+  /* Free the old, too-small stacks.  */
+  if (ptx_dev->omp_stacks.ptr)
+    {
+      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
+      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+    }
+
+  /* Make new and bigger stacks, and remember where we put them and how big
+     they are.  */
+  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
+				  size * num);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
-  return (void *) stacks;
+
+  ptx_dev->omp_stacks.size = size * num;
+
+  return (void *) ptx_dev->omp_stacks.ptr;
 }
 
-/* Release storage previously allocated by nvptx_stacks_alloc.  */
+/* Release the lock associated with a ptx_device's OpenMP stacks block.  */
 
 static void
-nvptx_stacks_free (void *p, int num)
+nvptx_stacks_release (CUstream stream, CUresult res, void *ptr)
 {
-  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+  if (res != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
+  struct ptx_device *ptx_dev = (struct ptx_device *) ptr;
+  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
 }
 
 void
@@ -1898,7 +1949,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   const char *fn_name = launch->fn;
   CUresult r;
   struct ptx_device *ptx_dev = ptx_devices[ord];
-  const char *maybe_abort_msg = "(perhaps abort was called)";
   int teams = 0, threads = 0;
 
   if (!args)
@@ -1922,7 +1972,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
   size_t stack_size = nvptx_stacks_size ();
-  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
+  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
   size_t fn_args_size = sizeof fn_args;
   void *config[] = {
@@ -1938,13 +1988,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
 
-  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
-  if (r == CUDA_ERROR_LAUNCH_FAILED)
-    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
-		       maybe_abort_msg);
-  else if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
-  nvptx_stacks_free (stacks, teams * threads);
+  CUDA_CALL_ASSERT (cuStreamAddCallback, NULL, nvptx_stacks_release,
+		    (void *) ptx_dev, 0);
 }
 
 /* TODO: Implement GOMP_OFFLOAD_async_run. */
-- 
2.28.0


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2021-01-05 15:32 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-26 14:14 [PATCH] nvptx: Cache stacks block for OpenMP kernel launch Julian Brown
2020-10-26 14:26 ` Jakub Jelinek
2020-11-09 21:32   ` Alexander Monakov
2020-11-13 20:54     ` Julian Brown
2020-12-08  1:13       ` Julian Brown
2020-12-08 17:11         ` Alexander Monakov
2020-12-15 13:39           ` Julian Brown
2020-12-15 13:49             ` Jakub Jelinek
2020-12-15 16:49               ` Julian Brown
2020-12-15 17:00                 ` Jakub Jelinek
2020-12-15 23:16                   ` Julian Brown
2021-01-05 12:13                     ` Julian Brown
2021-01-05 15:32                       ` Jakub Jelinek
2020-10-27 13:17 ` Julian Brown
2020-10-28  7:25   ` Chung-Lin Tang
2020-10-28 11:32     ` Julian Brown

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).