From: Thomas Schwinge <thomas@codesourcery.com>
To: <gcc-patches@gcc.gnu.org>, Tom de Vries <tdevries@suse.de>
Subject: [PING] nvptx: Support global constructors/destructors via 'collect2' for offloading (was: nvptx: Support global constructors/destructors via 'collect2')
Date: Wed, 11 Jan 2023 12:49:24 +0100 [thread overview]
Message-ID: <87fschl29n.fsf@euler.schwinge.homeip.net> (raw)
In-Reply-To: <87o7rup7f8.fsf@euler.schwinge.homeip.net>
[-- Attachment #1: Type: text/plain, Size: 1185 bytes --]
Hi!
Ping.
Grüße
Thomas
On 2022-12-23T14:37:47+0100, I wrote:
> Hi!
>
> On 2022-12-23T14:35:16+0100, I wrote:
>> On 2022-12-02T14:35:35+0100, I wrote:
>>> On 2022-12-01T22:13:38+0100, I wrote:
>>>> I'm working on support for global constructors/destructors with
>>>> GCC/nvptx
>>>
>>> See "nvptx: Support global constructors/destructors via 'collect2'"
>>> [posted before]
>>
>> Building on that, attached is now the additional "for offloading" piece:
>> "nvptx: Support global constructors/destructors via 'collect2' for offloading".
>> OK to push?
>
> Now really attached.
>
>> I did manually test this (by putting a few constructors/destructors into
>> 'libgomp/config/nvptx/oacc-parallel.c', and observing them being executed),
>> and also in my WIP development tree with standard libgfortran
>> constructors (with 'LIBGFOR_MINIMAL' disabled).
>
>
> Grüße
> Thomas
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-nvptx-Support-global-constructors-destructors-via-co.patch --]
[-- Type: text/x-diff, Size: 8131 bytes --]
From fb67006eeca0c8e2bfdf86576ed3109dacaf6868 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Wed, 30 Nov 2022 22:09:35 +0100
Subject: [PATCH] nvptx: Support global constructors/destructors via 'collect2'
for offloading
This extends "nvptx: Support global constructors/destructors via 'collect2'"
for offloading.
libgcc/
* config/nvptx/crtstuff.c ["mgomp"]
(__do_global_ctors__entry__mgomp)
(__do_global_dtors__entry__mgomp): New.
[!"mgomp"] (__do_global_ctors__entry, __do_global_dtors__entry):
New.
libgomp/
* plugin/plugin-nvptx.c (nvptx_do_global_cdtors): New.
(nvptx_close_device, GOMP_OFFLOAD_load_image)
(GOMP_OFFLOAD_unload_image): Call it.
---
libgcc/config/nvptx/crtstuff.c | 64 ++++++++++++++++++-
libgomp/plugin/plugin-nvptx.c | 113 ++++++++++++++++++++++++++++++++-
2 files changed, 175 insertions(+), 2 deletions(-)
diff --git a/libgcc/config/nvptx/crtstuff.c b/libgcc/config/nvptx/crtstuff.c
index 0823fc49901..8dc80687e0a 100644
--- a/libgcc/config/nvptx/crtstuff.c
+++ b/libgcc/config/nvptx/crtstuff.c
@@ -29,6 +29,14 @@
files (via 'CRT_BEGIN' and 'CRT_END'): 'crtbegin.o' and 'crtend.o', but we
do so anyway, for symmetry with other configurations. */
+
+/* See 'crt0.c', 'mgomp.c'. */
+#if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__)
+extern void *__nvptx_stacks[32] __attribute__((shared,nocommon));
+extern unsigned __nvptx_uni[32] __attribute__((shared,nocommon));
+#endif
+
+
#ifdef CRT_BEGIN
void
@@ -37,6 +45,33 @@ __do_global_ctors (void)
DO_GLOBAL_CTORS_BODY;
}
+/* Need '.entry' wrapper for offloading. */
+
+# if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__)
+
+__attribute__((kernel)) void __do_global_ctors__entry__mgomp (void *);
+
+void
+__do_global_ctors__entry__mgomp (void *nvptx_stacks_0)
+{
+ __nvptx_stacks[0] = nvptx_stacks_0;
+ __nvptx_uni[0] = 0;
+
+ __do_global_ctors ();
+}
+
+# else
+
+__attribute__((kernel)) void __do_global_ctors__entry (void);
+
+void
+__do_global_ctors__entry (void)
+{
+ __do_global_ctors ();
+}
+
+# endif
+
#elif defined(CRT_END) /* ! CRT_BEGIN */
void
@@ -45,7 +80,7 @@ __do_global_dtors (void)
/* In this configuration here, there's no way that "this routine is run more
than once [...] when exit is called recursively": for nvptx target, the
call to '__do_global_dtors' is registered via 'atexit', which doesn't
- re-enter a function already run.
+ re-enter a function already run, and neither does the nvptx offload target.
Therefore, we do *not* "arrange to remember where in the list we left off
processing". */
func_ptr *p;
@@ -53,6 +88,33 @@ __do_global_dtors (void)
(*p++) ();
}
+/* Need '.entry' wrapper for offloading. */
+
+# if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__)
+
+__attribute__((kernel)) void __do_global_dtors__entry__mgomp (void *);
+
+void
+__do_global_dtors__entry__mgomp (void *nvptx_stacks_0)
+{
+ __nvptx_stacks[0] = nvptx_stacks_0;
+ __nvptx_uni[0] = 0;
+
+ __do_global_dtors ();
+}
+
+# else
+
+__attribute__((kernel)) void __do_global_dtors__entry (void);
+
+void
+__do_global_dtors__entry (void)
+{
+ __do_global_dtors ();
+}
+
+# endif
+
#else /* ! CRT_BEGIN && ! CRT_END */
#error "One of CRT_BEGIN or CRT_END must be defined."
#endif
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index fcc97c6e0d5..395639537e8 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -338,6 +338,11 @@ struct ptx_device
static struct ptx_device **ptx_devices;
+static bool nvptx_do_global_cdtors (CUmodule, struct ptx_device *,
+ const char *);
+static size_t nvptx_stacks_size ();
+static void *nvptx_stacks_acquire (struct ptx_device *, size_t, int);
+
static inline struct nvptx_thread *
nvptx_thread (void)
{
@@ -557,6 +562,17 @@ nvptx_close_device (struct ptx_device *ptx_dev)
if (!ptx_dev)
return true;
+ bool ret = true;
+
+ for (struct ptx_image_data *image = ptx_dev->images;
+ image != NULL;
+ image = image->next)
+ {
+ if (!nvptx_do_global_cdtors (image->module, ptx_dev,
+ "__do_global_dtors__entry"))
+ ret = false;
+ }
+
for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
{
struct ptx_free_block *b_next = b->next;
@@ -577,7 +593,8 @@ nvptx_close_device (struct ptx_device *ptx_dev)
CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
free (ptx_dev);
- return true;
+
+ return ret;
}
static int
@@ -1280,6 +1297,93 @@ nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
+/* Invoke MODULE's global constructors/destructors. */
+
+static bool
+nvptx_do_global_cdtors (CUmodule module, struct ptx_device *ptx_dev,
+ const char *funcname)
+{
+ bool ret = true;
+ char *funcname_mgomp = NULL;
+ CUresult r;
+ CUfunction funcptr;
+ r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
+ &funcptr, module, funcname);
+ GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
+ funcname, cuda_error (r));
+ if (r == CUDA_ERROR_NOT_FOUND)
+ {
+ /* Try '[funcname]__mgomp'. */
+
+ size_t funcname_len = strlen (funcname);
+ const char *mgomp_suffix = "__mgomp";
+ size_t mgomp_suffix_len = strlen (mgomp_suffix);
+ funcname_mgomp
+ = GOMP_PLUGIN_malloc (funcname_len + mgomp_suffix_len + 1);
+ memcpy (funcname_mgomp, funcname, funcname_len);
+ memcpy (funcname_mgomp + funcname_len,
+ mgomp_suffix, mgomp_suffix_len + 1);
+ funcname = funcname_mgomp;
+
+ r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
+ &funcptr, module, funcname);
+ GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
+ funcname, cuda_error (r));
+ }
+ if (r == CUDA_ERROR_NOT_FOUND)
+ ;
+ else if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s",
+ funcname, cuda_error (r));
+ ret = false;
+ }
+ else
+ {
+ /* If necessary, set up soft stack. */
+ void *nvptx_stacks_0;
+ void *kargs[1];
+ if (funcname_mgomp)
+ {
+ size_t stack_size = nvptx_stacks_size ();
+ pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+ nvptx_stacks_0 = nvptx_stacks_acquire (ptx_dev, stack_size, 1);
+ nvptx_stacks_0 += stack_size;
+ kargs[0] = &nvptx_stacks_0;
+ }
+ r = CUDA_CALL_NOCHECK (cuLaunchKernel,
+ funcptr,
+ 1, 1, 1, 1, 1, 1,
+ /* sharedMemBytes */ 0,
+ /* hStream */ NULL,
+ /* kernelParams */ funcname_mgomp ? kargs : NULL,
+ /* extra */ NULL);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s",
+ funcname, cuda_error (r));
+ ret = false;
+ }
+
+ r = CUDA_CALL_NOCHECK (cuStreamSynchronize,
+ NULL);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s",
+ funcname, cuda_error (r));
+ ret = false;
+ }
+
+ if (funcname_mgomp)
+ pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+ }
+
+ if (funcname_mgomp)
+ free (funcname_mgomp);
+
+ return ret;
+}
+
/* Load the (partial) program described by TARGET_DATA to device
number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
will contain the on-device addresses of the functions for reverse offload.
@@ -1452,6 +1556,9 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
nvptx_set_clocktick (module, dev);
+ if (!nvptx_do_global_cdtors (module, dev, "__do_global_ctors__entry"))
+ return -1;
+
return fn_entries + var_entries + other_entries;
}
@@ -1477,6 +1584,10 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
if (image->target_data == target_data)
{
+ if (!nvptx_do_global_cdtors (image->module, dev,
+ "__do_global_dtors__entry"))
+ ret = false;
+
*prev_p = image->next;
if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
ret = false;
--
2.25.1
next prev parent reply other threads:[~2023-01-11 11:49 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <878rjqaku5.fsf@dem-tschwing-1.ger.mentorg.com>
2022-12-02 13:35 ` nvptx: Support global constructors/destructors via 'collect2' Thomas Schwinge
2022-12-20 8:03 ` [PING] " Thomas Schwinge
2023-01-11 11:48 ` [PING^2] " Thomas Schwinge
2023-01-24 9:01 ` Make 'libgcc/config/nvptx/crt0.c' build '--without-headers' (was: [PING] nvptx: Support global constructors/destructors via 'collect2') Thomas Schwinge
2022-12-23 13:35 ` nvptx: Support global constructors/destructors via 'collect2' for offloading (was: " Thomas Schwinge
2022-12-23 13:37 ` Thomas Schwinge
2023-01-11 11:49 ` Thomas Schwinge [this message]
2023-01-20 20:46 ` [og12] " Thomas Schwinge
2023-01-20 20:41 ` [og12] nvptx: Support global constructors/destructors via 'collect2' Thomas Schwinge
2023-01-20 20:45 ` Thomas Schwinge
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87fschl29n.fsf@euler.schwinge.homeip.net \
--to=thomas@codesourcery.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=tdevries@suse.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).