From: Julian Brown <julian@codesourcery.com>
To: Alexander Monakov <amonakov@ispras.ru>
Cc: Jakub Jelinek <jakub@redhat.com>, <gcc-patches@gcc.gnu.org>,
"Thomas Schwinge" <thomas@codesourcery.com>,
Tom de Vries <tdevries@suse.de>
Subject: Re: [PATCH] nvptx: Cache stacks block for OpenMP kernel launch
Date: Tue, 15 Dec 2020 13:39:13 +0000 [thread overview]
Message-ID: <20201215133913.32520253@squid.athome> (raw)
In-Reply-To: <alpine.LNX.2.20.13.2012082009570.2504@monopod.intra.ispras.ru>
[-- Attachment #1: Type: text/plain, Size: 425 bytes --]
On Tue, 8 Dec 2020 20:11:38 +0300
Alexander Monakov <amonakov@ispras.ru> wrote:
> On Tue, 8 Dec 2020, Julian Brown wrote:
>
> > Ping?
>
> This has addressed my concerns, thanks.
Jakub, Tom -- just to confirm, is this OK for trunk now?
I noticed a slight bug myself in the no-stacks/out-of-memory case --
i.e. for OpenACC, in nvptx_stacks_free. The attached version of the
patch includes a fix for that.
Thanks,
Julian
[-- Attachment #2: nvptx-stacks-caching-4.diff --]
[-- Type: text/x-patch, Size: 7713 bytes --]
commit 2e4160f0f0532890f6a2b405222f02c7ee1bb0ac
Author: Julian Brown <julian@codesourcery.com>
Date: Wed Oct 21 10:00:19 2020 -0700
nvptx: Cache stacks block for OpenMP kernel launch
2020-11-13 Julian Brown <julian@codesourcery.com>
libgomp/
* plugin/plugin-nvptx.c (SOFTSTACK_CACHE_LIMIT): New define.
(struct ptx_device): Add omp_stacks struct.
(nvptx_open_device): Initialise cached-stacks housekeeping info.
(nvptx_close_device): Free cached stacks block and mutex.
(nvptx_stacks_free): New function.
(nvptx_alloc): Add SUPPRESS_ERRORS parameter.
(GOMP_OFFLOAD_alloc): Add strategies for freeing soft-stacks block.
(nvptx_stacks_alloc): Rename to...
(nvptx_stacks_acquire): This. Cache stacks block between runs if same
size or smaller is required.
(nvptx_stacks_free): Remove old version taking (void *, int).
(GOMP_OFFLOAD_run): Call nvptx_stacks_acquire and lock stacks block
during kernel execution.
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 11d4ceeae62e..e08c36094098 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -49,6 +49,15 @@
#include <assert.h>
#include <errno.h>
+/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
+ block to cache between kernel invocations. For soft-stacks blocks bigger
+ than this, we will free the block before attempting another GPU memory
+ allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
+ we will free the cached soft-stacks block anyway then retry the
+ allocation. If that fails too, we lose. */
+
+#define SOFTSTACK_CACHE_LIMIT 134217728
+
#if CUDA_VERSION < 6000
extern CUresult cuGetErrorString (CUresult, const char **);
#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
@@ -307,6 +316,14 @@ struct ptx_device
struct ptx_free_block *free_blocks;
pthread_mutex_t free_blocks_lock;
+ /* OpenMP stacks, cached between kernel invocations. */
+ struct
+ {
+ CUdeviceptr ptr;
+ size_t size;
+ pthread_mutex_t lock;
+ } omp_stacks;
+
struct ptx_device *next;
};
@@ -514,6 +531,10 @@ nvptx_open_device (int n)
ptx_dev->free_blocks = NULL;
pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
+ ptx_dev->omp_stacks.ptr = 0;
+ ptx_dev->omp_stacks.size = 0;
+ pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
+
return ptx_dev;
}
@@ -534,6 +555,11 @@ nvptx_close_device (struct ptx_device *ptx_dev)
pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
pthread_mutex_destroy (&ptx_dev->image_lock);
+ pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
+
+ if (ptx_dev->omp_stacks.ptr)
+ CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
+
if (!ptx_dev->ctx_shared)
CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
@@ -999,12 +1025,40 @@ goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
+/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
+ size threshold, or if FORCE is true. */
+
+static void
+nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
+{
+ pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+ if (ptx_dev->omp_stacks.ptr
+ && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
+ {
+ CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+ ptx_dev->omp_stacks.ptr = 0;
+ ptx_dev->omp_stacks.size = 0;
+ }
+ pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+}
+
static void *
-nvptx_alloc (size_t s)
+nvptx_alloc (size_t s, bool suppress_errors)
{
CUdeviceptr d;
- CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
+ CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
+ if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
+ return NULL;
+ else if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
+ return NULL;
+ }
+
+ /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
bool profiling_p
= __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
@@ -1352,6 +1406,8 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
ptx_dev->free_blocks = NULL;
pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
+ nvptx_stacks_free (ptx_dev, false);
+
while (blocks)
{
tmp = blocks->next;
@@ -1360,7 +1416,16 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
blocks = tmp;
}
- return nvptx_alloc (size);
+ void *d = nvptx_alloc (size, true);
+ if (d)
+ return d;
+ else
+ {
+ /* Memory allocation failed. Try freeing the stacks block, and
+ retrying. */
+ nvptx_stacks_free (ptx_dev, true);
+ return nvptx_alloc (size, false);
+ }
}
bool
@@ -1866,26 +1931,36 @@ nvptx_stacks_size ()
return 128 * 1024;
}
-/* Return contiguous storage for NUM stacks, each SIZE bytes. */
+/* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
+ the storage should be held on entry, and remains held on exit. */
static void *
-nvptx_stacks_alloc (size_t size, int num)
+nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
{
- CUdeviceptr stacks;
- CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
+ if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
+ return (void *) ptx_dev->omp_stacks.ptr;
+
+ /* Free the old, too-small stacks. */
+ if (ptx_dev->omp_stacks.ptr)
+ {
+ CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
+ r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+ }
+
+ /* Make new and bigger stacks, and remember where we put them and how big
+ they are. */
+ CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
+ size * num);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
- return (void *) stacks;
-}
-/* Release storage previously allocated by nvptx_stacks_alloc. */
+ ptx_dev->omp_stacks.size = size * num;
-static void
-nvptx_stacks_free (void *p, int num)
-{
- CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+ return (void *) ptx_dev->omp_stacks.ptr;
}
void
@@ -1922,7 +1997,9 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
size_t stack_size = nvptx_stacks_size ();
- void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
+
+ pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+ void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
size_t fn_args_size = sizeof fn_args;
void *config[] = {
@@ -1944,7 +2021,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
maybe_abort_msg);
else if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
- nvptx_stacks_free (stacks, teams * threads);
+
+ pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}
/* TODO: Implement GOMP_OFFLOAD_async_run. */
next prev parent reply other threads:[~2020-12-15 13:39 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-26 14:14 Julian Brown
2020-10-26 14:26 ` Jakub Jelinek
2020-11-09 21:32 ` Alexander Monakov
2020-11-13 20:54 ` Julian Brown
2020-12-08 1:13 ` Julian Brown
2020-12-08 17:11 ` Alexander Monakov
2020-12-15 13:39 ` Julian Brown [this message]
2020-12-15 13:49 ` Jakub Jelinek
2020-12-15 16:49 ` Julian Brown
2020-12-15 17:00 ` Jakub Jelinek
2020-12-15 23:16 ` Julian Brown
2021-01-05 12:13 ` Julian Brown
2021-01-05 15:32 ` Jakub Jelinek
2020-10-27 13:17 ` Julian Brown
2020-10-28 7:25 ` Chung-Lin Tang
2020-10-28 11:32 ` Julian Brown
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20201215133913.32520253@squid.athome \
--to=julian@codesourcery.com \
--cc=amonakov@ispras.ru \
--cc=gcc-patches@gcc.gnu.org \
--cc=jakub@redhat.com \
--cc=tdevries@suse.de \
--cc=thomas@codesourcery.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).