libgomp/nvptx: Prepare for reverse-offload callback handling

This patch adds a stub 'gomp_target_rev' in the host's target.c, which will
later handle the reverse offload.
For nvptx, it adds support for forwarding the offloaded GOMP_target_ext call
to the host by setting values in a struct on the device and querying it on
the host, then invoking gomp_target_rev on the result.
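For context, the user-visible feature behind this plumbing is OpenMP 5.0
reverse offload: a 'target' region that itself targets its ancestor (host)
device. A minimal, hypothetical example of user code that would take this
path (not part of the patch):

    #include <stdio.h>

    #pragma omp requires reverse_offload

    int
    main (void)
    {
      #pragma omp target
      {
        int on_device = 42;
        /* Runs back on the host; on nvptx this goes through the modified
           GOMP_target_ext below.  */
        #pragma omp target device (ancestor : 1)
          printf ("host sees %d\n", on_device);
      }
      return 0;
    }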
include/ChangeLog:

	* cuda/cuda.h (enum CUdevice_attribute): Add
	CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
	(enum CUmemAttach_flags): New stub with only member
	CU_MEM_ATTACH_GLOBAL.
	(cuMemAllocManaged): Add prototype.

libgomp/ChangeLog:

	* config/nvptx/icv-device.c (GOMP_DEVICE_NUM_VAR): Remove
	'static' for this variable.
	* config/nvptx/target.c (GOMP_REV_OFFLOAD_VAR): #define as
	variable-name string and use it to define the variable.
	(GOMP_DEVICE_NUM_VAR): Declare this extern global var.
	(struct rev_offload): Define.
	(GOMP_target_ext): Handle reverse offload.
	* libgomp-plugin.h (GOMP_PLUGIN_target_rev): New prototype.
	* libgomp-plugin.c (GOMP_PLUGIN_target_rev): New, call ...
	* target.c (gomp_target_rev): ... this new stub function.
	* libgomp.h (gomp_target_rev): Declare.
	* libgomp.map (GOMP_PLUGIN_1.4): New; add GOMP_PLUGIN_target_rev.
	* plugin/cuda-lib.def (cuMemAllocManaged): Add.
	* plugin/plugin-nvptx.c (GOMP_REV_OFFLOAD_VAR): #define var string.
	(struct rev_offload): New.
	(struct ptx_device): Add concurr_managed_access, rev_data
	and rev_data_dev.
	(nvptx_open_device): Set ptx_device's concurr_managed_access;
	'#if 0' unused async_engines.
	(GOMP_OFFLOAD_load_image): Allocate rev_data variable.
	(rev_off_dev_to_host_cpy, rev_off_host_to_dev_cpy): New.
	(GOMP_OFFLOAD_run): Handle reverse offloading.

 include/cuda/cuda.h               |   8 ++-
 libgomp/config/nvptx/icv-device.c |   2 +-
 libgomp/config/nvptx/target.c     |  52 ++++++++++++--
 libgomp/libgomp-plugin.c          |  12 ++++
 libgomp/libgomp-plugin.h          |   7 ++
 libgomp/libgomp.h                 |   5 ++
 libgomp/libgomp.map               |   5 ++
 libgomp/plugin/cuda-lib.def       |   1 +
 libgomp/plugin/plugin-nvptx.c     | 148 +++++++++++++++++++++++++++++++++++++-
 libgomp/target.c                  |  18 +++++
 10 files changed, 246 insertions(+), 12 deletions(-)

diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h
index 3938d05d150..08e496a2e98 100644
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -77,9 +77,14 @@ typedef enum {
   CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
   CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
-  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
+  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
+  CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89
 } CUdevice_attribute;
 
+typedef enum {
+  CU_MEM_ATTACH_GLOBAL = 0x1
+} CUmemAttach_flags;
+
 enum {
   CU_EVENT_DEFAULT = 0,
   CU_EVENT_DISABLE_TIMING = 2
@@ -169,6 +174,7 @@ CUresult cuMemGetInfo (size_t *, size_t *);
 CUresult cuMemAlloc (CUdeviceptr *, size_t);
 #define cuMemAllocHost cuMemAllocHost_v2
 CUresult cuMemAllocHost (void **, size_t);
+CUresult cuMemAllocManaged (CUdeviceptr *, size_t, unsigned int);
 CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
 #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
 CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
diff --git a/libgomp/config/nvptx/icv-device.c b/libgomp/config/nvptx/icv-device.c
index faf90f9947c..f4f18cdac5e 100644
--- a/libgomp/config/nvptx/icv-device.c
+++ b/libgomp/config/nvptx/icv-device.c
@@ -60,7 +60,7 @@ omp_is_initial_device (void)
 /* This is set to the device number of current GPU during device
    initialization, when the offload image containing this libgomp portion is
    loaded.  */
-static volatile int GOMP_DEVICE_NUM_VAR;
+volatile int GOMP_DEVICE_NUM_VAR;
 
 int
 omp_get_device_num (void)
diff --git a/libgomp/config/nvptx/target.c b/libgomp/config/nvptx/target.c
index 11108d20e15..06f6cd8b611 100644
--- a/libgomp/config/nvptx/target.c
+++ b/libgomp/config/nvptx/target.c
@@ -26,7 +26,29 @@
 #include "libgomp.h"
 #include <limits.h>
 
+#define GOMP_REV_OFFLOAD_VAR __gomp_rev_offload_var
+
+/* Reverse offload.  Must match version used in plugin/plugin-nvptx.c.  */
+struct rev_offload {
+  uint64_t fn;
+  uint64_t mapnum;
+  uint64_t addrs;
+  uint64_t sizes;
+  uint64_t kinds;
+  int32_t dev_num;
+  uint32_t lock;
+};
+
+#if (__SIZEOF_SHORT__ != 2 \
+     || __SIZEOF_SIZE_T__ != 8 \
+     || __SIZEOF_POINTER__ != 8)
+#error "Data-type conversion required for rev_offload"
+#endif
+
+
 extern int __gomp_team_num __attribute__((shared));
+extern volatile int GOMP_DEVICE_NUM_VAR;
+volatile struct rev_offload *GOMP_REV_OFFLOAD_VAR;
 
 bool
 GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
@@ -88,16 +110,32 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
 		 void **hostaddrs, size_t *sizes, unsigned short *kinds,
 		 unsigned int flags, void **depend, void **args)
 {
-  (void) device;
-  (void) fn;
-  (void) mapnum;
-  (void) hostaddrs;
-  (void) sizes;
-  (void) kinds;
   (void) flags;
   (void) depend;
   (void) args;
-  __builtin_unreachable ();
+
+  if (device != GOMP_DEVICE_HOST_FALLBACK
+      || fn == NULL
+      || GOMP_REV_OFFLOAD_VAR == NULL)
+    return;
+
+  while (__sync_lock_test_and_set (&GOMP_REV_OFFLOAD_VAR->lock, (uint8_t) 1))
+    ;  /* spin  */
+
+  __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->mapnum, mapnum, __ATOMIC_SEQ_CST);
+  __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->addrs, hostaddrs, __ATOMIC_SEQ_CST);
+  __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->sizes, sizes, __ATOMIC_SEQ_CST);
+  __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->kinds, kinds, __ATOMIC_SEQ_CST);
+  __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->dev_num, GOMP_DEVICE_NUM_VAR,
+		    __ATOMIC_SEQ_CST);
+
+  /* 'fn' must be last.  */
+  __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->fn, fn, __ATOMIC_SEQ_CST);
+
+  /* Processed on the host - when done, fn is set to NULL.  */
+  while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->fn, __ATOMIC_SEQ_CST) != 0)
+    ;  /* spin  */
+  __sync_lock_release (&GOMP_REV_OFFLOAD_VAR->lock);
 }
 
 void
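Two points in the protocol above are worth spelling out: the 'lock' field
serializes competing teams, and 'fn' doubles as the request flag, which is
why it is stored last and why the host clears it to signal completion. A
stand-alone model of that handshake in plain C11 atomics (illustrative only;
the real implementation is the code above plus the host loop in
plugin-nvptx.c further below):

    #include <stdatomic.h>
    #include <stdint.h>

    struct rev_offload_model {
      _Atomic uint64_t fn;   /* 0 = no request pending; doubles as flag.  */
      uint64_t mapnum, addrs, sizes, kinds;
    };

    /* Device side: publish the payload first, the flag last.  */
    static void
    device_post (struct rev_offload_model *r, uint64_t fn, uint64_t mapnum)
    {
      r->mapnum = mapnum;          /* Payload (plain store for brevity; the
                                      patch uses __atomic_store_n).  */
      atomic_store (&r->fn, fn);   /* Flag last - request now visible.  */
      while (atomic_load (&r->fn) != 0)
        ;                          /* Spin until the host is done.  */
    }

    /* Host side: wait for a request, run it, clear the flag.  */
    static void
    host_poll (struct rev_offload_model *r)
    {
      while (atomic_load (&r->fn) == 0)
        ;                          /* Spin until a request arrives.  */
      /* ... execute the host version of fn ...  */
      atomic_store (&r->fn, 0);    /* Completion signal.  */
    }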
diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
index 9d4cc623a10..316de749f69 100644
--- a/libgomp/libgomp-plugin.c
+++ b/libgomp/libgomp-plugin.c
@@ -78,3 +78,15 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
   gomp_vfatal (msg, ap);
   va_end (ap);
 }
+
+void
+GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
+			uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
+			void (*dev_to_host_cpy) (void *, const void *, size_t,
+						 void *),
+			void (*host_to_dev_cpy) (void *, const void *, size_t,
+						 void *), void *token)
+{
+  gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
+		   dev_to_host_cpy, host_to_dev_cpy, token);
+}
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index ab3ed638475..40dfb52e44e 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -121,6 +121,13 @@ extern void GOMP_PLUGIN_error (const char *, ...)
 extern void GOMP_PLUGIN_fatal (const char *, ...)
 	__attribute__ ((noreturn, format (printf, 1, 2)));
 
+extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
+				    uint64_t, int,
+				    void (*) (void *, const void *, size_t,
+					      void *),
+				    void (*) (void *, const void *, size_t,
+					      void *), void *);
+
 /* Prototypes for functions implemented by libgomp plugins.  */
 extern const char *GOMP_OFFLOAD_get_name (void);
 extern unsigned int GOMP_OFFLOAD_get_caps (void);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index c243c4d6cf4..bbab5b4b0af 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1014,6 +1014,11 @@ extern int gomp_pause_host (void);
 extern void gomp_init_targets_once (void);
 extern int gomp_get_num_devices (void);
 extern bool gomp_target_task_fn (void *);
+extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
+			     int,
+			     void (*) (void *, const void *, size_t, void *),
+			     void (*) (void *, const void *, size_t, void *),
+			     void *);
 
 /* Splay tree definitions.  */
 typedef struct splay_tree_node_s *splay_tree_node;
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index 46d5f10f3e1..12f76f7e48f 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -622,3 +622,8 @@ GOMP_PLUGIN_1.3 {
 	GOMP_PLUGIN_goacc_profiling_dispatch;
 	GOMP_PLUGIN_goacc_thread;
 } GOMP_PLUGIN_1.2;
+
+GOMP_PLUGIN_1.4 {
+  global:
+	GOMP_PLUGIN_target_rev;
+} GOMP_PLUGIN_1.3;
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index cd91b39b1d2..61359c7e74e 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -29,6 +29,7 @@ CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2)
 CUDA_ONE_CALL (cuLinkDestroy)
 CUDA_ONE_CALL (cuMemAlloc)
 CUDA_ONE_CALL (cuMemAllocHost)
+CUDA_ONE_CALL (cuMemAllocManaged)
 CUDA_ONE_CALL (cuMemcpy)
 CUDA_ONE_CALL (cuMemcpyDtoDAsync)
 CUDA_ONE_CALL (cuMemcpyDtoH)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index bc63e274cdf..7ab9421b060 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -54,6 +54,8 @@
 #include <assert.h>
 #include <errno.h>
 
+#define GOMP_REV_OFFLOAD_VAR __gomp_rev_offload_var
+
 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
    block to cache between kernel invocations.  For soft-stacks blocks bigger
    than this, we will free the block before attempting another GPU memory
@@ -274,6 +276,17 @@ struct targ_fn_descriptor
   int max_threads_per_block;
 };
 
+/* Reverse offload.  Must match version used in config/nvptx/target.c.  */
+struct rev_offload {
+  uint64_t fn;
+  uint64_t mapnum;
+  uint64_t addrs;
+  uint64_t sizes;
+  uint64_t kinds;
+  int32_t dev_num;
+  uint32_t lock;
+};
+
 /* A loaded PTX image.  */
 struct ptx_image_data
 {
@@ -302,6 +315,7 @@ struct ptx_device
   bool map;
   bool concur;
   bool mkern;
+  bool concurr_managed_access;
   int mode;
   int clock_khz;
   int num_sms;
@@ -329,6 +343,9 @@ struct ptx_device
     pthread_mutex_t lock;
   } omp_stacks;
 
+  struct rev_offload *rev_data;
+  CUdeviceptr rev_data_dev;
+
   struct ptx_device *next;
 };
 
@@ -423,7 +440,7 @@ nvptx_open_device (int n)
   struct ptx_device *ptx_dev;
   CUdevice dev, ctx_dev;
   CUresult r;
-  int async_engines, pi;
+  int pi;
 
   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
 
@@ -519,11 +536,17 @@ nvptx_open_device (int n)
 			CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
   ptx_dev->max_threads_per_multiprocessor = pi;
 
+#if 0
+  int async_engines;
   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
 			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
   if (r != CUDA_SUCCESS)
     async_engines = 1;
+#endif
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+			 CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, dev);
+  ptx_dev->concurr_managed_access = r == CUDA_SUCCESS ? pi : false;
 
   for (int i = 0; i != GOMP_DIM_MAX; i++)
     ptx_dev->default_dims[i] = 0;
@@ -1313,6 +1336,38 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
 				 * fn_entries);
 
+  if (rev_fn_table && dev->rev_data == NULL)
+    {
+      CUdeviceptr dp = 0;
+      if (dev->concurr_managed_access && CUDA_CALL_EXISTS (cuMemAllocManaged))
+	{
+	  CUDA_CALL_ASSERT (cuMemAllocManaged, (void *) &dev->rev_data,
+			    sizeof (*dev->rev_data), CU_MEM_ATTACH_GLOBAL);
+	  dp = (CUdeviceptr) dev->rev_data;
+	}
+      else
+	{
+	  CUDA_CALL_ASSERT (cuMemAllocHost, (void **) &dev->rev_data,
+			    sizeof (*dev->rev_data));
+	  memset (dev->rev_data, '\0', sizeof (*dev->rev_data));
+	  CUDA_CALL_ASSERT (cuMemAlloc, &dev->rev_data_dev,
+			    sizeof (*dev->rev_data));
+	  dp = dev->rev_data_dev;
+	}
+      CUdeviceptr device_rev_offload_var;
+      size_t device_rev_offload_size;
+      CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
+				      &device_rev_offload_var,
+				      &device_rev_offload_size, module,
+				      XSTRING (GOMP_REV_OFFLOAD_VAR));
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+      r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
+			     sizeof (dp));
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
+    }
+
   *target_table = targ_tbl;
 
   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
@@ -1373,6 +1428,22 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   targ_tbl->start = targ_tbl->end = 0;
   targ_tbl++;
 
+  if (rev_fn_table)
+    {
+      CUdeviceptr var;
+      size_t bytes;
+      r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
+			     "$offload_func_table");
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+      assert (bytes == sizeof (uint64_t) * fn_entries);
+      *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
+      r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
+      if (r != CUDA_SUCCESS)
+	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
+    }
+
+
 nvptx_set_clocktick (module, dev);
 
   return fn_entries + var_entries + other_entries;
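A note on the cuModuleGetGlobal lookup above: GOMP_REV_OFFLOAD_VAR expands to
the identifier __gomp_rev_offload_var, so it has to be stringified after
macro expansion to name the device symbol. XSTRING is presumably the usual
two-level stringification helper (libgomp picks it up via include/symcat.h);
a minimal sketch of that idiom, under that assumption:

    /* Sketch of two-level stringification (assumed to match symcat.h):
       XSTRING (GOMP_REV_OFFLOAD_VAR) first expands the argument, then
       yields the string "__gomp_rev_offload_var" for cuModuleGetGlobal.  */
    #define STRINGX(s) #s
    #define XSTRING(s) STRINGX (s)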
@@ -1982,6 +2053,23 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
   return (void *) ptx_dev->omp_stacks.ptr;
 }
 
+
+void
+rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
+			 CUstream stream)
+{
+  CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
+  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
+}
+
+void
+rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
+			 CUstream stream)
+{
+  CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
+  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
+}
+
 void
 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 {
@@ -2016,6 +2104,10 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
   size_t stack_size = nvptx_stacks_size ();
+  bool reverse_off = ptx_dev->rev_data != NULL;
+  bool has_usm = (ptx_dev->concurr_managed_access
+		  && CUDA_CALL_EXISTS (cuMemAllocManaged));
+  CUstream copy_stream = NULL;
 
   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
   void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
@@ -2029,12 +2121,62 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
 		     __FUNCTION__, fn_name, teams, threads);
+  if (reverse_off)
+    CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
 			 32, threads, 1, 0, NULL, NULL, config);
   if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
-
-  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+  if (reverse_off)
+    while (true)
+      {
+	r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
+	if (r == CUDA_SUCCESS)
+	  break;
+	if (r == CUDA_ERROR_LAUNCH_FAILED)
+	  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
+			     maybe_abort_msg);
+	else if (r != CUDA_ERROR_NOT_READY)
+	  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
+	if (!has_usm)
+	  {
+	    CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, ptx_dev->rev_data,
+			      ptx_dev->rev_data_dev,
+			      sizeof (*ptx_dev->rev_data), copy_stream);
+	    CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+	  }
+	if (ptx_dev->rev_data->fn != 0)
+	  {
+	    struct rev_offload *rev_data = ptx_dev->rev_data;
+	    uint64_t fn_ptr = rev_data->fn;
+	    uint64_t mapnum = rev_data->mapnum;
+	    uint64_t addr_ptr = rev_data->addrs;
+	    uint64_t sizes_ptr = rev_data->sizes;
+	    uint64_t kinds_ptr = rev_data->kinds;
+	    int dev_num = (int) rev_data->dev_num;
+	    GOMP_PLUGIN_target_rev (fn_ptr, mapnum, addr_ptr, sizes_ptr,
+				    kinds_ptr, dev_num, rev_off_dev_to_host_cpy,
+				    rev_off_host_to_dev_cpy, copy_stream);
+	    rev_data->fn = 0;
+	    if (!has_usm)
+	      {
+		/* fn is the first element.  */
+		r = CUDA_CALL_NOCHECK (cuMemcpyHtoDAsync,
+				       ptx_dev->rev_data_dev,
+				       ptx_dev->rev_data,
+				       sizeof (ptx_dev->rev_data->fn),
+				       copy_stream);
+		if (r != CUDA_SUCCESS)
+		  GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
+		CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+	      }
+	  }
+	usleep (1);
+      }
+  else
+    r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+  if (reverse_off)
+    CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
   if (r == CUDA_ERROR_LAUNCH_FAILED)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
 		       maybe_abort_msg);
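One subtlety in the polling loop above: in the non-USM case the host copies
back only sizeof (rev_data->fn) bytes to clear the device-side flag, which is
correct only because 'fn' is the first member of struct rev_offload, as the
"fn is the first element" comment notes. A compile-time check of that
invariant (illustrative; not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* Same layout as the rev_offload struct defined earlier.  */
    struct rev_offload {
      uint64_t fn, mapnum, addrs, sizes, kinds;
      int32_t dev_num;
      uint32_t lock;
    };

    /* The single-field HtoD copy writes device-side fn = 0 only if fn
       stays at offset 0.  */
    _Static_assert (offsetof (struct rev_offload, fn) == 0,
		    "fn must remain the first member of rev_offload");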
diff --git a/libgomp/target.c b/libgomp/target.c
index 135db1d88ab..0c6fad690f1 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -2856,6 +2856,24 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
   htab_free (refcount_set);
 }
 
+/* Handle reverse offload.  This is not called for the host.  */
+
+void
+gomp_target_rev (uint64_t fn_ptr __attribute__ ((unused)),
+		 uint64_t mapnum __attribute__ ((unused)),
+		 uint64_t devaddrs_ptr __attribute__ ((unused)),
+		 uint64_t sizes_ptr __attribute__ ((unused)),
+		 uint64_t kinds_ptr __attribute__ ((unused)),
+		 int dev_num __attribute__ ((unused)),
+		 void (*dev_to_host_cpy) (void *, const void *, size_t,
+					  void *) __attribute__ ((unused)),
+		 void (*host_to_dev_cpy) (void *, const void *, size_t,
+					  void *) __attribute__ ((unused)),
+		 void *token __attribute__ ((unused)))
+{
+  __builtin_unreachable ();
+}
+
 /* Host fallback for GOMP_target_data{,_ext} routines.  */
 
 static void