From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1643) id 6E7CF3858D20; Fri, 20 Jan 2023 20:44:04 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 6E7CF3858D20 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1674247444; bh=wjqyUVfTxgtjsdtmI2gMglIM2cOA10C5gWgyC1qSZ0g=; h=From:To:Subject:Date:From; b=InzYSe1QSBJ6ckLvYLejpnS/QnHpBXBBm6am79PtWPgEuuVFYfQ5fMDwIxKw+7Lv1 DdMZ0FIa4RSNeUVTdgW9s+gQW7Ku9it7GhNzl1pEKf8JOd2QfD1okjhHhfsV+muxao tgFqIirfYZ62lx+VVZktZfVFuC76gZ6e3l6bWTBg= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Thomas Schwinge To: gcc-cvs@gcc.gnu.org Subject: [gcc/devel/omp/gcc-12] nvptx: Support global constructors/destructors via 'collect2' for offloading X-Act-Checkin: gcc X-Git-Author: Thomas Schwinge X-Git-Refname: refs/heads/devel/omp/gcc-12 X-Git-Oldrev: fe07b0003bb2092bc34d4bed504be1868b88782d X-Git-Newrev: 689a5340c7e4286b451f1bc600342550c7c94da2 Message-Id: <20230120204404.6E7CF3858D20@sourceware.org> Date: Fri, 20 Jan 2023 20:44:04 +0000 (GMT) List-Id: https://gcc.gnu.org/g:689a5340c7e4286b451f1bc600342550c7c94da2 commit 689a5340c7e4286b451f1bc600342550c7c94da2 Author: Thomas Schwinge Date: Wed Nov 30 22:09:35 2022 +0100 nvptx: Support global constructors/destructors via 'collect2' for offloading This extends "nvptx: Support global constructors/destructors via 'collect2'" for offloading. libgcc/ * config/nvptx/crtstuff.c ["mgomp"] (__do_global_ctors__entry__mgomp) (__do_global_dtors__entry__mgomp): New. [!"mgomp"] (__do_global_ctors__entry, __do_global_dtors__entry): New. libgomp/ * plugin/plugin-nvptx.c (nvptx_do_global_cdtors): New. (nvptx_close_device, GOMP_OFFLOAD_load_image) (GOMP_OFFLOAD_unload_image): Call it. Diff: --- libgcc/ChangeLog.omp | 6 +++ libgcc/config/nvptx/crtstuff.c | 64 ++++++++++++++++++++++- libgomp/ChangeLog.omp | 4 ++ libgomp/plugin/plugin-nvptx.c | 113 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 185 insertions(+), 2 deletions(-) diff --git a/libgcc/ChangeLog.omp b/libgcc/ChangeLog.omp index 68a99cbe427..2e7bf5cc029 100644 --- a/libgcc/ChangeLog.omp +++ b/libgcc/ChangeLog.omp @@ -1,5 +1,11 @@ 2023-01-20 Thomas Schwinge + * config/nvptx/crtstuff.c ["mgomp"] + (__do_global_ctors__entry__mgomp) + (__do_global_dtors__entry__mgomp): New. + [!"mgomp"] (__do_global_ctors__entry, __do_global_dtors__entry): + New. + * config.host : Add 'crtbegin.o', 'crtend.o' to 'extra_parts'. * config/nvptx/crt0.c: Invoke '__do_global_ctors', diff --git a/libgcc/config/nvptx/crtstuff.c b/libgcc/config/nvptx/crtstuff.c index 0823fc49901..8dc80687e0a 100644 --- a/libgcc/config/nvptx/crtstuff.c +++ b/libgcc/config/nvptx/crtstuff.c @@ -29,6 +29,14 @@ files (via 'CRT_BEGIN' and 'CRT_END'): 'crtbegin.o' and 'crtend.o', but we do so anyway, for symmetry with other configurations. */ + +/* See 'crt0.c', 'mgomp.c'. */ +#if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__) +extern void *__nvptx_stacks[32] __attribute__((shared,nocommon)); +extern unsigned __nvptx_uni[32] __attribute__((shared,nocommon)); +#endif + + #ifdef CRT_BEGIN void @@ -37,6 +45,33 @@ __do_global_ctors (void) DO_GLOBAL_CTORS_BODY; } +/* Need '.entry' wrapper for offloading. */ + +# if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__) + +__attribute__((kernel)) void __do_global_ctors__entry__mgomp (void *); + +void +__do_global_ctors__entry__mgomp (void *nvptx_stacks_0) +{ + __nvptx_stacks[0] = nvptx_stacks_0; + __nvptx_uni[0] = 0; + + __do_global_ctors (); +} + +# else + +__attribute__((kernel)) void __do_global_ctors__entry (void); + +void +__do_global_ctors__entry (void) +{ + __do_global_ctors (); +} + +# endif + #elif defined(CRT_END) /* ! CRT_BEGIN */ void @@ -45,7 +80,7 @@ __do_global_dtors (void) /* In this configuration here, there's no way that "this routine is run more than once [...] when exit is called recursively": for nvptx target, the call to '__do_global_dtors' is registered via 'atexit', which doesn't - re-enter a function already run. + re-enter a function already run, and neither does nvptx offload target. Therefore, we do *not* "arrange to remember where in the list we left off processing". */ func_ptr *p; @@ -53,6 +88,33 @@ __do_global_dtors (void) (*p++) (); } +/* Need '.entry' wrapper for offloading. */ + +# if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__) + +__attribute__((kernel)) void __do_global_dtors__entry__mgomp (void *); + +void +__do_global_dtors__entry__mgomp (void *nvptx_stacks_0) +{ + __nvptx_stacks[0] = nvptx_stacks_0; + __nvptx_uni[0] = 0; + + __do_global_dtors (); +} + +# else + +__attribute__((kernel)) void __do_global_dtors__entry (void); + +void +__do_global_dtors__entry (void) +{ + __do_global_dtors (); +} + +# endif + #else /* ! CRT_BEGIN && ! CRT_END */ #error "One of CRT_BEGIN or CRT_END must be defined." #endif diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp index 4447b74a2ab..32aa9705296 100644 --- a/libgomp/ChangeLog.omp +++ b/libgomp/ChangeLog.omp @@ -1,5 +1,9 @@ 2023-01-20 Thomas Schwinge + * plugin/plugin-nvptx.c (nvptx_do_global_cdtors): New. + (nvptx_close_device, GOMP_OFFLOAD_load_image) + (GOMP_OFFLOAD_unload_image): Call it. + * plugin/plugin-nvptx.c (nvptx_exec): Assert what we know about 'blockDimX'. diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index b2fabc61cc8..8e7b63bd637 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -344,6 +344,11 @@ static struct ptx_device **ptx_devices; default is set here. */ static unsigned lowlat_pool_size = 8*1024; +static bool nvptx_do_global_cdtors (CUmodule, struct ptx_device *, + const char *); +static size_t nvptx_stacks_size (); +static void *nvptx_stacks_acquire (struct ptx_device *, size_t, int); + static inline struct nvptx_thread * nvptx_thread (void) { @@ -571,6 +576,17 @@ nvptx_close_device (struct ptx_device *ptx_dev) if (!ptx_dev) return true; + bool ret = true; + + for (struct ptx_image_data *image = ptx_dev->images; + image != NULL; + image = image->next) + { + if (!nvptx_do_global_cdtors (image->module, ptx_dev, + "__do_global_dtors__entry")) + ret = false; + } + for (struct ptx_free_block *b = ptx_dev->free_blocks; b;) { struct ptx_free_block *b_next = b->next; @@ -591,7 +607,8 @@ nvptx_close_device (struct ptx_device *ptx_dev) CUDA_CALL (cuCtxDestroy, ptx_dev->ctx); free (ptx_dev); - return true; + + return ret; } static int @@ -1313,6 +1330,93 @@ nvptx_set_clocktick (CUmodule module, struct ptx_device *dev) GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); } +/* Invoke MODULE's global constructors/destructors. */ + +static bool +nvptx_do_global_cdtors (CUmodule module, struct ptx_device *ptx_dev, + const char *funcname) +{ + bool ret = true; + char *funcname_mgomp = NULL; + CUresult r; + CUfunction funcptr; + r = CUDA_CALL_NOCHECK (cuModuleGetFunction, + &funcptr, module, funcname); + GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n", + funcname, cuda_error (r)); + if (r == CUDA_ERROR_NOT_FOUND) + { + /* Try '[funcname]__mgomp'. */ + + size_t funcname_len = strlen (funcname); + const char *mgomp_suffix = "__mgomp"; + size_t mgomp_suffix_len = strlen (mgomp_suffix); + funcname_mgomp + = GOMP_PLUGIN_malloc (funcname_len + mgomp_suffix_len + 1); + memcpy (funcname_mgomp, funcname, funcname_len); + memcpy (funcname_mgomp + funcname_len, + mgomp_suffix, mgomp_suffix_len + 1); + funcname = funcname_mgomp; + + r = CUDA_CALL_NOCHECK (cuModuleGetFunction, + &funcptr, module, funcname); + GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n", + funcname, cuda_error (r)); + } + if (r == CUDA_ERROR_NOT_FOUND) + ; + else if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s", + funcname, cuda_error (r)); + ret = false; + } + else + { + /* If necessary, set up soft stack. */ + void *nvptx_stacks_0; + void *kargs[1]; + if (funcname_mgomp) + { + size_t stack_size = nvptx_stacks_size (); + pthread_mutex_lock (&ptx_dev->omp_stacks.lock); + nvptx_stacks_0 = nvptx_stacks_acquire (ptx_dev, stack_size, 1); + nvptx_stacks_0 += stack_size; + kargs[0] = &nvptx_stacks_0; + } + r = CUDA_CALL_NOCHECK (cuLaunchKernel, + funcptr, + 1, 1, 1, 1, 1, 1, + /* sharedMemBytes */ 0, + /* hStream */ NULL, + /* kernelParams */ funcname_mgomp ? kargs : NULL, + /* extra */ NULL); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s", + funcname, cuda_error (r)); + ret = false; + } + + r = CUDA_CALL_NOCHECK (cuStreamSynchronize, + NULL); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s", + funcname, cuda_error (r)); + ret = false; + } + + if (funcname_mgomp) + pthread_mutex_unlock (&ptx_dev->omp_stacks.lock); + } + + if (funcname_mgomp) + free (funcname_mgomp); + + return ret; +} + /* Load the (partial) program described by TARGET_DATA to device number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE will contain the on-device addresses of the functions for reverse offload. @@ -1485,6 +1589,9 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, nvptx_set_clocktick (module, dev); + if (!nvptx_do_global_cdtors (module, dev, "__do_global_ctors__entry")) + return -1; + return fn_entries + var_entries + other_entries; } @@ -1510,6 +1617,10 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data) for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next) if (image->target_data == target_data) { + if (!nvptx_do_global_cdtors (image->module, dev, + "__do_global_dtors__entry")) + ret = false; + *prev_p = image->next; if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS) ret = false;