Hi.

At a high level, I'd be quite uncomfortable with this. I guess we are in
vague agreement that it cannot be implemented efficiently. It also goes
against good accelerator programming practice, which is to queue work on
the accelerator and let it run asynchronously with the CPU, keeping
occupancy high. (I know libgomp still waits for the GPU to finish in each
GOMP_OFFLOAD_run, but maybe it's better to improve *that* instead of
piling on new slowness.)

What I said above also applies to MPI+GPU scenarios: a well-designed
algorithm should arrange for MPI communication to happen in parallel with
some useful offloaded computation. I don't see the value in implementing
the ability to invoke an MPI call from the accelerator in such an
inefficient fashion. (So yes, I disagree with "it is better to provide a
feature even if it is slow – than not providing it at all" when the
feature is advertised as general-purpose rather than as a purely
debugging helper.)

On to the patch itself.

IIRC one of the open questions was the use of CUDA managed memory. I
think it is unsafe, because device-issued atomics are not guaranteed to
appear atomic to the host unless the code is compiled for compute
capability 6.0 or above and uses system-scope atomics ("atom.sys"). And
for the non-USM code path you're relying on cudaMemcpy observing
device-side atomics in the right order.

Atomics aside, CUDA pinned memory would be a natural choice for such a
tiny structure. Did you rule it out for some reason?

Some remarks on the diff below; this is not intended to be a complete
review.

Alexander

> --- a/libgomp/config/nvptx/target.c
> +++ b/libgomp/config/nvptx/target.c
> @@ -26,7 +26,29 @@
> #include "libgomp.h"
> #include 
> 
> +#define GOMP_REV_OFFLOAD_VAR __gomp_rev_offload_var

Shouldn't this be in a header? It needs to stay in sync with the plugin.

> +
> +/* Reverse offload. Must match version used in plugin/plugin-nvptx.c. */
> +struct rev_offload {
> + uint64_t fn;
> + uint64_t mapnum;
> + uint64_t addrs;
> + uint64_t sizes;
> + uint64_t kinds;
> + int32_t dev_num;
> + uint32_t lock;
> +};

Likewise.

> +
> +#if (__SIZEOF_SHORT__ != 2 \
> + || __SIZEOF_SIZE_T__ != 8 \
> + || __SIZEOF_POINTER__ != 8)
> +#error "Data-type conversion required for rev_offload"
> +#endif

Huh? This requirement is not new for reverse offload; it has always been
like that for offloading in general (all ABI rules regarding type sizes,
struct layout, bitfield layout and endianness must match).

> +
> +
> extern int __gomp_team_num __attribute__((shared));
> +extern volatile struct gomp_offload_icvs GOMP_ADDITIONAL_ICVS;
> +volatile struct rev_offload *GOMP_REV_OFFLOAD_VAR;
> 
> bool
> GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
> @@ -88,16 +110,32 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
> void **hostaddrs, size_t *sizes, unsigned short *kinds,
> unsigned int flags, void **depend, void **args)
> {
> - (void) device;
> - (void) fn;
> - (void) mapnum;
> - (void) hostaddrs;
> - (void) sizes;
> - (void) kinds;
> (void) flags;
> (void) depend;
> (void) args;
> - __builtin_unreachable ();
> +
> + if (device != GOMP_DEVICE_HOST_FALLBACK
> + || fn == NULL
> + || GOMP_REV_OFFLOAD_VAR == NULL)
> + return;

Shouldn't this be an 'assert' instead?
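To make that concrete, I'm thinking of something along these lines (only
a sketch; whether assert, abort or __builtin_trap is the right reporting
mechanism on the device side is a separate question):

  /* Sketch: if these conditions can only result from a compiler or
     libgomp bug, rather than from anything user code can legitimately
     do, fail loudly instead of silently dropping the reverse offload.  */
  if (device != GOMP_DEVICE_HOST_FALLBACK || fn == NULL
      || GOMP_REV_OFFLOAD_VAR == NULL)
    __builtin_trap ();
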
> + > + while (__sync_lock_test_and_set (&GOMP_REV_OFFLOAD_VAR->lock, (uint8_t) 1)) > + ; /* spin */ > + > + __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->mapnum, mapnum, __ATOMIC_SEQ_CST); > + __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->addrs, hostaddrs, __ATOMIC_SEQ_CST); > + __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->sizes, sizes, __ATOMIC_SEQ_CST); > + __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->kinds, kinds, __ATOMIC_SEQ_CST); > + __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->dev_num, > + GOMP_ADDITIONAL_ICVS.device_num, __ATOMIC_SEQ_CST); Looks like all these can be plain stores, you only need ... > + > + /* 'fn' must be last. */ > + __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->fn, fn, __ATOMIC_SEQ_CST); ... this to be atomic with 'release' semantics in the usual producer-consumer pattern. > + > + /* Processed on the host - when done, fn is set to NULL. */ > + while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->fn, __ATOMIC_SEQ_CST) != 0) > + ; /* spin */ > + __sync_lock_release (&GOMP_REV_OFFLOAD_VAR->lock); > } > > void > diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c > index 9d4cc62..316de74 100644 > --- a/libgomp/libgomp-plugin.c > +++ b/libgomp/libgomp-plugin.c > @@ -78,3 +78,15 @@ GOMP_PLUGIN_fatal (const char *msg, ...) > gomp_vfatal (msg, ap); > va_end (ap); > } > + > +void > +GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr, > + uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num, > + void (*dev_to_host_cpy) (void *, const void *, size_t, > + void *), > + void (*host_to_dev_cpy) (void *, const void *, size_t, > + void *), void *token) > +{ > + gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num, > + dev_to_host_cpy, host_to_dev_cpy, token); > +} > diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h > index 6ab5ac6..875f967 100644 > --- a/libgomp/libgomp-plugin.h > +++ b/libgomp/libgomp-plugin.h > @@ -121,6 +121,13 @@ extern void GOMP_PLUGIN_error (const char *, ...) > extern void GOMP_PLUGIN_fatal (const char *, ...) > __attribute__ ((noreturn, format (printf, 1, 2))); > > +extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, > + uint64_t, int, > + void (*) (void *, const void *, size_t, > + void *), > + void (*) (void *, const void *, size_t, > + void *), void *); > + > /* Prototypes for functions implemented by libgomp plugins. */ > extern const char *GOMP_OFFLOAD_get_name (void); > extern unsigned int GOMP_OFFLOAD_get_caps (void); > diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h > index 7519274..5803683 100644 > --- a/libgomp/libgomp.h > +++ b/libgomp/libgomp.h > @@ -1128,6 +1128,11 @@ extern int gomp_pause_host (void); > extern void gomp_init_targets_once (void); > extern int gomp_get_num_devices (void); > extern bool gomp_target_task_fn (void *); > +extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, > + int, > + void (*) (void *, const void *, size_t, void *), > + void (*) (void *, const void *, size_t, void *), > + void *); > > /* Splay tree definitions. 
*/ > typedef struct splay_tree_node_s *splay_tree_node; > diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map > index 46d5f10..12f76f7 100644 > --- a/libgomp/libgomp.map > +++ b/libgomp/libgomp.map > @@ -622,3 +622,8 @@ GOMP_PLUGIN_1.3 { > GOMP_PLUGIN_goacc_profiling_dispatch; > GOMP_PLUGIN_goacc_thread; > } GOMP_PLUGIN_1.2; > + > +GOMP_PLUGIN_1.4 { > + global: > + GOMP_PLUGIN_target_rev; > +} GOMP_PLUGIN_1.3; > diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def > index cd91b39..61359c7 100644 > --- a/libgomp/plugin/cuda-lib.def > +++ b/libgomp/plugin/cuda-lib.def > @@ -29,6 +29,7 @@ CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2) > CUDA_ONE_CALL (cuLinkDestroy) > CUDA_ONE_CALL (cuMemAlloc) > CUDA_ONE_CALL (cuMemAllocHost) > +CUDA_ONE_CALL (cuMemAllocManaged) > CUDA_ONE_CALL (cuMemcpy) > CUDA_ONE_CALL (cuMemcpyDtoDAsync) > CUDA_ONE_CALL (cuMemcpyDtoH) > diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c > index ba6b229..1bd9ee2 100644 > --- a/libgomp/plugin/plugin-nvptx.c > +++ b/libgomp/plugin/plugin-nvptx.c > @@ -54,6 +54,8 @@ > #include > #include > > +#define GOMP_REV_OFFLOAD_VAR __gomp_rev_offload_var > + > /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks > block to cache between kernel invocations. For soft-stacks blocks bigger > than this, we will free the block before attempting another GPU memory > @@ -274,6 +276,17 @@ struct targ_fn_descriptor > int max_threads_per_block; > }; > > +/* Reverse offload. Must match version used in config/nvptx/target.c. */ > +struct rev_offload { > + uint64_t fn; > + uint64_t mapnum; > + uint64_t addrs; > + uint64_t sizes; > + uint64_t kinds; > + int32_t dev_num; > + uint32_t lock; > +}; > + > /* A loaded PTX image. */ > struct ptx_image_data > { > @@ -302,6 +315,7 @@ struct ptx_device > bool map; > bool concur; > bool mkern; > + bool concurr_managed_access; > int mode; > int clock_khz; > int num_sms; > @@ -329,6 +343,9 @@ struct ptx_device > pthread_mutex_t lock; > } omp_stacks; > > + struct rev_offload *rev_data; > + CUdeviceptr rev_data_dev; > + > struct ptx_device *next; > }; > > @@ -423,7 +440,7 @@ nvptx_open_device (int n) > struct ptx_device *ptx_dev; > CUdevice dev, ctx_dev; > CUresult r; > - int async_engines, pi; > + int pi; > > CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n); > > @@ -519,11 +536,17 @@ nvptx_open_device (int n) > CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); > ptx_dev->max_threads_per_multiprocessor = pi; > > +#if 0 > + int async_engines; > r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines, > CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); > if (r != CUDA_SUCCESS) > async_engines = 1; > +#endif > > + r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, > + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, dev); > + ptx_dev->concurr_managed_access = r == CUDA_SUCCESS ? pi : false; > for (int i = 0; i != GOMP_DIM_MAX; i++) > ptx_dev->default_dims[i] = 0; > > @@ -1380,7 +1403,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, > else if (rev_fn_table) > { > CUdeviceptr var; > - size_t bytes; > + size_t bytes, i; > r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module, > "$offload_func_table"); > if (r != CUDA_SUCCESS) > @@ -1390,6 +1413,47 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, > r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes); > if (r != CUDA_SUCCESS) > GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r)); > + /* Free if only NULL entries. 
*/ > + for (i = 0; i < fn_entries; ++i) > + if (rev_fn_table[i] != 0) > + break; > + if (i == fn_entries) > + { > + free (*rev_fn_table); > + *rev_fn_table = NULL; > + } > + } > + > + if (rev_fn_table && dev->rev_data == NULL) > + { > + CUdeviceptr dp = 0; > + if (dev->concurr_managed_access && CUDA_CALL_EXISTS (cuMemAllocManaged)) > + { > + CUDA_CALL_ASSERT (cuMemAllocManaged, (void *) &dev->rev_data, > + sizeof (*dev->rev_data), CU_MEM_ATTACH_GLOBAL); > + dp = (CUdeviceptr) dev->rev_data; > + } > + else > + { > + CUDA_CALL_ASSERT (cuMemAllocHost, (void **) &dev->rev_data, > + sizeof (*dev->rev_data)); > + memset (dev->rev_data, '\0', sizeof (*dev->rev_data)); > + CUDA_CALL_ASSERT (cuMemAlloc, &dev->rev_data_dev, > + sizeof (*dev->rev_data)); > + dp = dev->rev_data_dev; > + } > + CUdeviceptr device_rev_offload_var; > + size_t device_rev_offload_size; > + CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, > + &device_rev_offload_var, > + &device_rev_offload_size, module, > + XSTRING (GOMP_REV_OFFLOAD_VAR)); > + if (r != CUDA_SUCCESS) > + GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r)); > + r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp, > + sizeof (dp)); > + if (r != CUDA_SUCCESS) > + GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); > } > > nvptx_set_clocktick (module, dev); > @@ -2001,6 +2065,23 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num) > return (void *) ptx_dev->omp_stacks.ptr; > } > > + > +void > +rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size, > + CUstream stream) > +{ > + CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream); > + CUDA_CALL_ASSERT (cuStreamSynchronize, stream); > +} > + > +void > +rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size, > + CUstream stream) > +{ > + CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream); > + CUDA_CALL_ASSERT (cuStreamSynchronize, stream); > +} > + > void > GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) > { > @@ -2035,6 +2116,10 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) > nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads); > > size_t stack_size = nvptx_stacks_size (); > + bool reverse_off = ptx_dev->rev_data != NULL; > + bool has_usm = (ptx_dev->concurr_managed_access > + && CUDA_CALL_EXISTS (cuMemAllocManaged)); > + CUstream copy_stream = NULL; > > pthread_mutex_lock (&ptx_dev->omp_stacks.lock); > void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads); > @@ -2048,12 +2133,62 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) > GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" > " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n", > __FUNCTION__, fn_name, teams, threads); > + if (reverse_off) > + CUDA_CALL_ASSERT (cuStreamCreate, ©_stream, CU_STREAM_NON_BLOCKING); > r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1, > 32, threads, 1, 0, NULL, NULL, config); > if (r != CUDA_SUCCESS) > GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); > - > - r = CUDA_CALL_NOCHECK (cuCtxSynchronize, ); > + if (reverse_off) > + while (true) > + { > + r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL); > + if (r == CUDA_SUCCESS) > + break; > + if (r == CUDA_ERROR_LAUNCH_FAILED) > + GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r), > + maybe_abort_msg); > + else if (r != CUDA_ERROR_NOT_READY) > + GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); > + if 
(!has_usm)
> + {
> + CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, ptx_dev->rev_data,
> + ptx_dev->rev_data_dev,
> + sizeof (*ptx_dev->rev_data), copy_stream);
> + CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
> + }
> + if (ptx_dev->rev_data->fn != 0)

Surely this needs to be an atomic load with 'acquire' semantics in the
has_usm case? (See the sketch at the end of this mail.)

> + {
> + struct rev_offload *rev_data = ptx_dev->rev_data;
> + uint64_t fn_ptr = rev_data->fn;
> + uint64_t mapnum = rev_data->mapnum;
> + uint64_t addr_ptr = rev_data->addrs;
> + uint64_t sizes_ptr = rev_data->sizes;
> + uint64_t kinds_ptr = rev_data->kinds;
> + int dev_num = (int) rev_data->dev_num;
> + GOMP_PLUGIN_target_rev (fn_ptr, mapnum, addr_ptr, sizes_ptr,
> + kinds_ptr, dev_num, rev_off_dev_to_host_cpy,
> + rev_off_host_to_dev_cpy, copy_stream);
> + rev_data->fn = 0;

Atomic store?

> + if (!has_usm)
> + {
> + /* fn is the first element. */
> + r = CUDA_CALL_NOCHECK (cuMemcpyHtoDAsync,
> + ptx_dev->rev_data_dev,
> + ptx_dev->rev_data,
> + sizeof (ptx_dev->rev_data->fn),
> + copy_stream);
> + if (r != CUDA_SUCCESS)
> + GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
> + CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
> + }
> + }
> + usleep (1);
> + }
> + else
> + r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
> + if (reverse_off)
> + CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
> if (r == CUDA_ERROR_LAUNCH_FAILED)
> GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
> maybe_abort_msg);
> diff --git a/libgomp/target.c b/libgomp/target.c
> index 5763483..9377de0 100644
> --- a/libgomp/target.c
> +++ b/libgomp/target.c
> @@ -2925,6 +2925,25 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
> htab_free (refcount_set);
> }
> 
> +/* Handle reverse offload. This is called by the device plugins for a
> + reverse offload; it is not called if the outer target runs on the host. */
> +
> +void
> +gomp_target_rev (uint64_t fn_ptr __attribute__ ((unused)),
> + uint64_t mapnum __attribute__ ((unused)),
> + uint64_t devaddrs_ptr __attribute__ ((unused)),
> + uint64_t sizes_ptr __attribute__ ((unused)),
> + uint64_t kinds_ptr __attribute__ ((unused)),
> + int dev_num __attribute__ ((unused)),
> + void (*dev_to_host_cpy) (void *, const void *, size_t,
> + void *) __attribute__ ((unused)),
> + void (*host_to_dev_cpy) (void *, const void *, size_t,
> + void *) __attribute__ ((unused)),
> + void *token __attribute__ ((unused)))
> +{
> + __builtin_unreachable ();
> +}
> +
> /* Host fallback for GOMP_target_data{,_ext} routines. */
> 
> static void
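
Coming back to the 'release'/'acquire' remarks above: here is a minimal
sketch of the handshake I have in mind for the USM path, using the field
names from the patch. It only illustrates the ordering; it assumes the
atomics really end up system-scope (which, as noted at the top, is its
own problem), and it leaves the device-side team lock and the non-USM
copies out of the picture.

Device side (config/nvptx/target.c):

  /* Plain stores for the payload ...  */
  GOMP_REV_OFFLOAD_VAR->mapnum = mapnum;
  GOMP_REV_OFFLOAD_VAR->addrs = (uint64_t) hostaddrs;
  GOMP_REV_OFFLOAD_VAR->sizes = (uint64_t) sizes;
  GOMP_REV_OFFLOAD_VAR->kinds = (uint64_t) kinds;
  GOMP_REV_OFFLOAD_VAR->dev_num = GOMP_ADDITIONAL_ICVS.device_num;

  /* ... published by a single release store of 'fn'.  */
  __atomic_store_n (&GOMP_REV_OFFLOAD_VAR->fn, (uint64_t) fn,
                    __ATOMIC_RELEASE);

  /* Acquire, so that whatever the host wrote before clearing 'fn' is
     visible once we continue.  */
  while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->fn, __ATOMIC_ACQUIRE) != 0)
    ; /* spin */

Host side, inside the polling loop (plugin-nvptx.c):

  uint64_t fn = __atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE);
  if (fn != 0)
    {
      /* The payload can now be read with plain loads.  */
      GOMP_PLUGIN_target_rev (fn, ptx_dev->rev_data->mapnum,
                              ptx_dev->rev_data->addrs,
                              ptx_dev->rev_data->sizes,
                              ptx_dev->rev_data->kinds,
                              (int) ptx_dev->rev_data->dev_num,
                              rev_off_dev_to_host_cpy,
                              rev_off_host_to_dev_cpy, copy_stream);
      /* Release, so the device observes our writes before it sees
         fn == 0.  */
      __atomic_store_n (&ptx_dev->rev_data->fn, 0, __ATOMIC_RELEASE);
    }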