commit 2e4160f0f0532890f6a2b405222f02c7ee1bb0ac
Author: Julian Brown
Date:   Wed Oct 21 10:00:19 2020 -0700

    nvptx: Cache stacks block for OpenMP kernel launch

    2020-11-13  Julian Brown

    libgomp/
        * plugin/plugin-nvptx.c (SOFTSTACK_CACHE_LIMIT): New define.
        (struct ptx_device): Add omp_stacks struct.
        (nvptx_open_device): Initialise cached-stacks housekeeping info.
        (nvptx_close_device): Free cached stacks block and mutex.
        (nvptx_stacks_free): New function.
        (nvptx_alloc): Add SUPPRESS_ERRORS parameter.
        (GOMP_OFFLOAD_alloc): Add strategies for freeing soft-stacks block.
        (nvptx_stacks_alloc): Rename to...
        (nvptx_stacks_acquire): This.  Cache stacks block between runs if same
        size or smaller is required.
        (nvptx_stacks_free): Remove.
        (GOMP_OFFLOAD_run): Call nvptx_stacks_acquire and lock stacks block
        during kernel execution.

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 11d4ceeae62e..e08c36094098 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -49,6 +49,15 @@
 #include
 #include
 
+/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
+   block to cache between kernel invocations.  For soft-stacks blocks bigger
+   than this, we will free the block before attempting another GPU memory
+   allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
+   we will free the cached soft-stacks block anyway then retry the
+   allocation.  If that fails too, we lose.  */
+
+#define SOFTSTACK_CACHE_LIMIT 134217728
+
 #if CUDA_VERSION < 6000
 extern CUresult cuGetErrorString (CUresult, const char **);
 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
@@ -307,6 +316,14 @@ struct ptx_device
   struct ptx_free_block *free_blocks;
   pthread_mutex_t free_blocks_lock;
 
+  /* OpenMP stacks, cached between kernel invocations.  */
+  struct
+    {
+      CUdeviceptr ptr;
+      size_t size;
+      pthread_mutex_t lock;
+    } omp_stacks;
+
   struct ptx_device *next;
 };
 
@@ -514,6 +531,10 @@ nvptx_open_device (int n)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
 
+  ptx_dev->omp_stacks.ptr = 0;
+  ptx_dev->omp_stacks.size = 0;
+  pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
+
   return ptx_dev;
 }
 
@@ -534,6 +555,11 @@ nvptx_close_device (struct ptx_device *ptx_dev)
   pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
   pthread_mutex_destroy (&ptx_dev->image_lock);
 
+  pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
+
+  if (ptx_dev->omp_stacks.ptr)
+    CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
+
   if (!ptx_dev->ctx_shared)
     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 
@@ -999,12 +1025,40 @@ goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                         api_info);
 }
 
+/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
+   size threshold, or if FORCE is true.  */
+
+static void
+nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
+{
+  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+  if (ptx_dev->omp_stacks.ptr
+      && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
+    {
+      CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+      ptx_dev->omp_stacks.ptr = 0;
+      ptx_dev->omp_stacks.size = 0;
+    }
+  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+}
+
 static void *
-nvptx_alloc (size_t s)
+nvptx_alloc (size_t s, bool suppress_errors)
 {
   CUdeviceptr d;
 
-  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
+  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
+  if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
+    return NULL;
+  else if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
+      return NULL;
+    }
+
+  /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
   bool profiling_p
     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
@@ -1352,6 +1406,8 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
 
+  nvptx_stacks_free (ptx_dev, false);
+
   while (blocks)
     {
       tmp = blocks->next;
@@ -1360,7 +1416,16 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
       blocks = tmp;
     }
 
-  return nvptx_alloc (size);
+  void *d = nvptx_alloc (size, true);
+  if (d)
+    return d;
+  else
+    {
+      /* Memory allocation failed.  Try freeing the stacks block, and
+         retrying.  */
+      nvptx_stacks_free (ptx_dev, true);
+      return nvptx_alloc (size, false);
+    }
 }
 
 bool
@@ -1866,26 +1931,36 @@ nvptx_stacks_size ()
   return 128 * 1024;
 }
 
-/* Return contiguous storage for NUM stacks, each SIZE bytes.  */
+/* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
+   the storage should be held on entry, and remains held on exit.  */
 
 static void *
-nvptx_stacks_alloc (size_t size, int num)
+nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
 {
-  CUdeviceptr stacks;
-  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
+  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
+    return (void *) ptx_dev->omp_stacks.ptr;
+
+  /* Free the old, too-small stacks.  */
+  if (ptx_dev->omp_stacks.ptr)
+    {
+      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
+      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+    }
+
+  /* Make new and bigger stacks, and remember where we put them and how big
+     they are.  */
+  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
+                                  size * num);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
-  return (void *) stacks;
-}
 
-/* Release storage previously allocated by nvptx_stacks_alloc.  */
+  ptx_dev->omp_stacks.size = size * num;
 
-static void
-nvptx_stacks_free (void *p, int num)
-{
-  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+  return (void *) ptx_dev->omp_stacks.ptr;
 }
 
 void
@@ -1922,7 +1997,9 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
   size_t stack_size = nvptx_stacks_size ();
-  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
+
+  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
   size_t fn_args_size = sizeof fn_args;
   void *config[] = {
@@ -1944,7 +2021,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
                        maybe_abort_msg);
   else if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
-  nvptx_stacks_free (stacks, teams * threads);
+
+  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
 }
 
 /* TODO: Implement GOMP_OFFLOAD_async_run.  */
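Not part of the patch: below is a minimal, self-contained sketch of the caching pattern the change introduces, for readers who want to see the acquire/trim logic in isolation.  The names stack_cache, cache_acquire and cache_trim are hypothetical, and plain malloc/free stand in for cuMemAlloc/cuMemFree so the example builds and runs on the host; the real plugin additionally synchronises the CUDA context before freeing a block that may still be in use.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Analogue of SOFTSTACK_CACHE_LIMIT: blocks above this size are not kept.  */
#define CACHE_LIMIT (128 * 1024 * 1024)

struct stack_cache
{
  void *ptr;
  size_t size;
  pthread_mutex_t lock;
};

/* Return a block of at least SIZE bytes, reusing the cached block when it is
   big enough.  The lock is taken here and remains held on return, mirroring
   nvptx_stacks_acquire; the caller unlocks once the "kernel" has finished.  */

static void *
cache_acquire (struct stack_cache *c, size_t size)
{
  pthread_mutex_lock (&c->lock);
  if (c->ptr && c->size >= size)
    return c->ptr;

  /* Cached block is absent or too small: replace it with a bigger one.  */
  free (c->ptr);
  c->ptr = malloc (size);
  c->size = c->ptr ? size : 0;
  return c->ptr;
}

/* Drop the cached block if FORCE is true or it exceeds the size threshold,
   mirroring nvptx_stacks_free as called from GOMP_OFFLOAD_alloc.  */

static void
cache_trim (struct stack_cache *c, bool force)
{
  pthread_mutex_lock (&c->lock);
  if (c->ptr && (force || c->size > CACHE_LIMIT))
    {
      free (c->ptr);
      c->ptr = NULL;
      c->size = 0;
    }
  pthread_mutex_unlock (&c->lock);
}

int
main (void)
{
  struct stack_cache c = { NULL, 0, PTHREAD_MUTEX_INITIALIZER };

  /* Two back-to-back "launches": the second reuses the cached block because
     it needs no more space than is already there.  */
  void *stacks = cache_acquire (&c, 32 * 128 * 1024);
  printf ("first launch:  %p\n", stacks);
  pthread_mutex_unlock (&c.lock);

  stacks = cache_acquire (&c, 16 * 128 * 1024);
  printf ("second launch: %p (reused)\n", stacks);
  pthread_mutex_unlock (&c.lock);

  /* On allocation pressure the cache is dropped unconditionally.  */
  cache_trim (&c, true);
  return 0;
}

Holding the lock from cache_acquire until after the launch completes is what lets the block be reused safely: neither a concurrent GOMP_OFFLOAD_run nor a trim triggered by GOMP_OFFLOAD_alloc can free or replace the stacks while a kernel is still using them.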