libgomp: Add no-target-region rev offload test + fix plugin-nvptx OpenMP permits that a 'target device(ancestor:1)' is called without being enclosed in a target region - using the current device (i.e. the host) in that case. This commit adds a testcase for this. In case of nvptx, the missing on-device 'GOMP_target_ext' call causes that it and also the associated on-device GOMP_REV_OFFLOAD_VAR variable are not linked in from nvptx's libgomp.a. Thus, handle the failing cuModuleGetGlobal gracefully by disabling reverse offload and assuming that the failure is fine. libgomp/ChangeLog: * plugin/plugin-nvptx.c (GOMP_OFFLOAD_load_image): Use unsigned int for 'i' to match 'fn_entries'; regard absent GOMP_REV_OFFLOAD_VAR as valid and the code having no reverse-offload code. * testsuite/libgomp.c-c++-common/reverse-offload-2.c: New test. libgomp/plugin/plugin-nvptx.c | 36 ++++++++++------ .../libgomp.c-c++-common/reverse-offload-2.c | 49 ++++++++++++++++++++++ 2 files changed, 73 insertions(+), 12 deletions(-) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 0768fca350b..e803f083591 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1390,7 +1390,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, else if (rev_fn_table) { CUdeviceptr var; - size_t bytes, i; + size_t bytes; + unsigned int i; r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module, "$offload_func_table"); if (r != CUDA_SUCCESS) @@ -1413,12 +1414,11 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL) { - /* cuMemHostAlloc memory is accessible on the device, if unified-shared - address is supported; this is assumed - see comment in - nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */ - CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data, - sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP); - CUdeviceptr dp = (CUdeviceptr) dev->rev_data; + /* Get the on-device GOMP_REV_OFFLOAD_VAR variable. It should be + available but it might be not. One reason could be: if the user code + has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext + is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR + are not linked in. */ CUdeviceptr device_rev_offload_var; size_t device_rev_offload_size; CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, @@ -1426,11 +1426,23 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, &device_rev_offload_size, module, XSTRING (GOMP_REV_OFFLOAD_VAR)); if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuModuleGetGlobal error - GOMP_REV_OFFLOAD_VAR: %s", cuda_error (r)); - r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp, - sizeof (dp)); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); + { + free (*rev_fn_table); + *rev_fn_table = NULL; + } + else + { + /* cuMemHostAlloc memory is accessible on the device, if + unified-shared address is supported; this is assumed - see comment + in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */ + CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data, + sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP); + CUdeviceptr dp = (CUdeviceptr) dev->rev_data; + r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp, + sizeof (dp)); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); + } } nvptx_set_clocktick (module, dev); diff --git a/libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c b/libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c new file mode 100644 index 00000000000..33bd38481bb --- /dev/null +++ b/libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c @@ -0,0 +1,49 @@ +/* { dg-do run } */ +/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */ + +#pragma omp requires reverse_offload + +int +main () +{ + int A[10]; + int y; + + for (int i = 0; i < 10; i++) + A[i] = 2*i; + + y = 42; + + /* Pointlessly copy to the default device. */ + #pragma omp target data map(to: A) + { + /* Not enclosed in a target region (= i.e. running on the host); the + following is valid - it runs on the current device (= host). */ + #pragma omp target device ( ancestor:1 ) firstprivate(y) map(to: A) + { + if (y != 42) + __builtin_abort (); + for (int i = 0; i < 10; i++) + if (A[i] != 2*i) + __builtin_abort (); + for (int i = 0; i < 10; i++) + if (A[i] != 2*i) + A[i] = 4*i; + y = 31; + } + + if (y != 42) + __builtin_abort (); + for (int i = 0; i < 10; i++) + if (A[i] != 2*i) + __builtin_abort (); + } + + if (y != 42) + __builtin_abort (); + for (int i = 0; i < 10; i++) + if (A[i] != 2*i) + __builtin_abort (); + + return 0; +}