commit 2e4160f0f0532890f6a2b405222f02c7ee1bb0ac
Author: Julian Brown
Date:   Wed Oct 21 10:00:19 2020 -0700

    nvptx: Cache stacks block for OpenMP kernel launch

    2020-11-13  Julian Brown

    libgomp/
        * plugin/plugin-nvptx.c (SOFTSTACK_CACHE_LIMIT): New define.
        (struct ptx_device): Add omp_stacks struct.
        (nvptx_open_device): Initialise cached-stacks housekeeping info.
        (nvptx_close_device): Free cached stacks block and mutex.
        (nvptx_stacks_free): New function.
        (nvptx_alloc): Add SUPPRESS_ERRORS parameter.
        (GOMP_OFFLOAD_alloc): Add strategies for freeing soft-stacks block.
        (nvptx_stacks_alloc): Rename to...
        (nvptx_stacks_acquire): This.  Cache stacks block between runs if same
        size or smaller is required.
        (nvptx_stacks_free): Remove.
        (GOMP_OFFLOAD_run): Call nvptx_stacks_acquire and lock stacks block
        during kernel execution.

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 11d4ceeae62e..e08c36094098 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -49,6 +49,15 @@
 #include
 #include
 
+/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
+   block to cache between kernel invocations.  For soft-stacks blocks bigger
+   than this, we will free the block before attempting another GPU memory
+   allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
+   we will free the cached soft-stacks block anyway then retry the
+   allocation.  If that fails too, we lose.  */
+
+#define SOFTSTACK_CACHE_LIMIT 134217728
+
 #if CUDA_VERSION < 6000
 extern CUresult cuGetErrorString (CUresult, const char **);
 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
@@ -307,6 +316,14 @@ struct ptx_device
   struct ptx_free_block *free_blocks;
   pthread_mutex_t free_blocks_lock;
 
+  /* OpenMP stacks, cached between kernel invocations.  */
+  struct
+    {
+      CUdeviceptr ptr;
+      size_t size;
+      pthread_mutex_t lock;
+    } omp_stacks;
+
   struct ptx_device *next;
 };
 
@@ -514,6 +531,10 @@ nvptx_open_device (int n)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
 
+  ptx_dev->omp_stacks.ptr = 0;
+  ptx_dev->omp_stacks.size = 0;
+  pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
+
   return ptx_dev;
 }
 
@@ -534,6 +555,11 @@ nvptx_close_device (struct ptx_device *ptx_dev)
   pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
   pthread_mutex_destroy (&ptx_dev->image_lock);
 
+  pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
+
+  if (ptx_dev->omp_stacks.ptr)
+    CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
+
   if (!ptx_dev->ctx_shared)
     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 
@@ -999,12 +1025,40 @@ goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                         api_info);
 }
 
+/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
+   size threshold, or if FORCE is true.  */
+
+static void
+nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
+{
+  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+  if (ptx_dev->omp_stacks.ptr
+      && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
+    {
+      CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+      ptx_dev->omp_stacks.ptr = 0;
+      ptx_dev->omp_stacks.size = 0;
+    }
+  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+}
+
 static void *
-nvptx_alloc (size_t s)
+nvptx_alloc (size_t s, bool suppress_errors)
 {
   CUdeviceptr d;
 
-  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
+  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
+  if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
+    return NULL;
+  else if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
+      return NULL;
+    }
+
+  /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
   bool profiling_p
     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
@@ -1352,6 +1406,8 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
 
+  nvptx_stacks_free (ptx_dev, false);
+
   while (blocks)
     {
       tmp = blocks->next;
@@ -1360,7 +1416,16 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
       blocks = tmp;
     }
 
-  return nvptx_alloc (size);
+  void *d = nvptx_alloc (size, true);
+  if (d)
+    return d;
+  else
+    {
+      /* Memory allocation failed.  Try freeing the stacks block, and
+         retrying.  */
+      nvptx_stacks_free (ptx_dev, true);
+      return nvptx_alloc (size, false);
+    }
 }
 
 bool
@@ -1866,26 +1931,36 @@ nvptx_stacks_size ()
   return 128 * 1024;
 }
 
-/* Return contiguous storage for NUM stacks, each SIZE bytes.  */
+/* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
+   the storage should be held on entry, and remains held on exit.  */
 
 static void *
-nvptx_stacks_alloc (size_t size, int num)
+nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
 {
-  CUdeviceptr stacks;
-  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
+  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
+    return (void *) ptx_dev->omp_stacks.ptr;
+
+  /* Free the old, too-small stacks.  */
+  if (ptx_dev->omp_stacks.ptr)
+    {
+      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
+      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
+      if (r != CUDA_SUCCESS)
+        GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+    }
+
+  /* Make new and bigger stacks, and remember where we put them and how big
+     they are.  */
+  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
+                                  size * num);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
-  return (void *) stacks;
-}
 
-/* Release storage previously allocated by nvptx_stacks_alloc.  */
+  ptx_dev->omp_stacks.size = size * num;
 
-static void
-nvptx_stacks_free (void *p, int num)
-{
-  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
-  if (r != CUDA_SUCCESS)
-    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+  return (void *) ptx_dev->omp_stacks.ptr;
 }
 
 void
@@ -1922,7 +1997,9 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
   size_t stack_size = nvptx_stacks_size ();
-  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
+
+  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
   size_t fn_args_size = sizeof fn_args;
   void *config[] = {
@@ -1944,7 +2021,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
                        maybe_abort_msg);
   else if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
-  nvptx_stacks_free (stacks, teams * threads);
+
+  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
 }
 
 /* TODO: Implement GOMP_OFFLOAD_async_run.  */
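Not part of the patch: below is a minimal, self-contained sketch of the caching pattern the change introduces, for readers who want to see the acquire/trim logic in isolation.  The names stack_cache, cache_acquire and cache_trim are hypothetical, and plain malloc/free stand in for cuMemAlloc/cuMemFree so the example builds and runs on the host; the real plugin additionally synchronises the CUDA context before freeing a block that may still be in use.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Analogue of SOFTSTACK_CACHE_LIMIT: blocks above this size are not kept.  */
#define CACHE_LIMIT (128 * 1024 * 1024)

struct stack_cache
{
  void *ptr;
  size_t size;
  pthread_mutex_t lock;
};

/* Return a block of at least SIZE bytes, reusing the cached block when it is
   big enough.  The lock is taken here and remains held on return, mirroring
   nvptx_stacks_acquire; the caller unlocks once the "kernel" has finished.  */

static void *
cache_acquire (struct stack_cache *c, size_t size)
{
  pthread_mutex_lock (&c->lock);
  if (c->ptr && c->size >= size)
    return c->ptr;

  /* Cached block is absent or too small: replace it with a bigger one.  */
  free (c->ptr);
  c->ptr = malloc (size);
  c->size = c->ptr ? size : 0;
  return c->ptr;
}

/* Drop the cached block if FORCE is true or it exceeds the size threshold,
   mirroring nvptx_stacks_free as called from GOMP_OFFLOAD_alloc.  */

static void
cache_trim (struct stack_cache *c, bool force)
{
  pthread_mutex_lock (&c->lock);
  if (c->ptr && (force || c->size > CACHE_LIMIT))
    {
      free (c->ptr);
      c->ptr = NULL;
      c->size = 0;
    }
  pthread_mutex_unlock (&c->lock);
}

int
main (void)
{
  struct stack_cache c = { NULL, 0, PTHREAD_MUTEX_INITIALIZER };

  /* Two back-to-back "launches": the second reuses the cached block because
     it needs no more space than is already there.  */
  void *stacks = cache_acquire (&c, 32 * 128 * 1024);
  printf ("first launch:  %p\n", stacks);
  pthread_mutex_unlock (&c.lock);

  stacks = cache_acquire (&c, 16 * 128 * 1024);
  printf ("second launch: %p (reused)\n", stacks);
  pthread_mutex_unlock (&c.lock);

  /* On allocation pressure the cache is dropped unconditionally.  */
  cache_trim (&c, true);
  return 0;
}

Holding the lock from cache_acquire until after the launch completes is what lets the block be reused safely: neither a concurrent GOMP_OFFLOAD_run nor a trim triggered by GOMP_OFFLOAD_alloc can free or replace the stacks while a kernel is still using them.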