From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1534) id 1CC903852C6A; Mon, 28 Nov 2022 14:29:49 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 1CC903852C6A DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1669645789; bh=tZspY1SKa6fZgdm79k6JQ2tn8vU7jDH+aZRs6vajI9Y=; h=From:To:Subject:Date:From; b=nBPt/arvucLFSykzR8zfoiFmmra1qCKCUgYOg90KogqSqe3tgua1F0szbnmyfP0FK lsfXpMrO72K9HEZfCiAEtZHNe7AXmbCMt2GTv6lONLhIvsXeMuoJ0n8nYAVD+G9hCc xusdaj5Atkx1zJQp5QMesBEl1O1eSVSuXTXEGTro= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Tobias Burnus To: gcc-cvs@gcc.gnu.org Subject: [gcc/devel/omp/gcc-12] libgomp: Add no-target-region rev offload test + fix plugin-nvptx X-Act-Checkin: gcc X-Git-Author: Tobias Burnus X-Git-Refname: refs/heads/devel/omp/gcc-12 X-Git-Oldrev: 6c6d03e4c69442f3b0c6407693978775ebd4ddf1 X-Git-Newrev: 97c07e0b841ac919c5eb6f4b8c335ede1f32de78 Message-Id: <20221128142949.1CC903852C6A@sourceware.org> Date: Mon, 28 Nov 2022 14:29:49 +0000 (GMT) List-Id: https://gcc.gnu.org/g:97c07e0b841ac919c5eb6f4b8c335ede1f32de78 commit 97c07e0b841ac919c5eb6f4b8c335ede1f32de78 Author: Tobias Burnus Date: Mon Nov 28 15:16:47 2022 +0100 libgomp: Add no-target-region rev offload test + fix plugin-nvptx OpenMP permits that a 'target device(ancestor:1)' is called without being enclosed in a target region - using the current device (i.e. the host) in that case. This commit adds a testcase for this. In case of nvptx, the missing on-device 'GOMP_target_ext' call causes that it and also the associated on-device GOMP_REV_OFFLOAD_VAR variable are not linked in from nvptx's libgomp.a. Thus, handle the failing cuModuleGetGlobal gracefully by disabling reverse offload and assuming that the failure is fine. libgomp/ChangeLog: * plugin/plugin-nvptx.c (GOMP_OFFLOAD_load_image): Use unsigned int for 'i' to match 'fn_entries'; regard absent GOMP_REV_OFFLOAD_VAR as valid and the code having no reverse-offload code. * testsuite/libgomp.c-c++-common/reverse-offload-2.c: New test. (cherry picked from commit 9f9d128f459e0c5ace8f7b85504d277b5a838daf) Diff: --- libgomp/ChangeLog.omp | 10 +++++ libgomp/plugin/plugin-nvptx.c | 36 ++++++++++------ .../libgomp.c-c++-common/reverse-offload-2.c | 49 ++++++++++++++++++++++ 3 files changed, 83 insertions(+), 12 deletions(-) diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp index b259d72e576..3360c9d7608 100644 --- a/libgomp/ChangeLog.omp +++ b/libgomp/ChangeLog.omp @@ -1,3 +1,13 @@ +2022-11-28 Tobias Burnus + + Backported from master: + 2022-11-25 Tobias Burnus + + * plugin/plugin-nvptx.c (GOMP_OFFLOAD_load_image): Use unsigned int + for 'i' to match 'fn_entries'; regard absent GOMP_REV_OFFLOAD_VAR + as valid and the code having no reverse-offload code. + * testsuite/libgomp.c-c++-common/reverse-offload-2.c: New test. + 2022-11-28 Tobias Burnus Backported from master: diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index be3af6cea2a..d4c6dd119fe 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1423,7 +1423,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, else if (rev_fn_table) { CUdeviceptr var; - size_t bytes, i; + size_t bytes; + unsigned int i; r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module, "$offload_func_table"); if (r != CUDA_SUCCESS) @@ -1446,12 +1447,11 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL) { - /* cuMemHostAlloc memory is accessible on the device, if unified-shared - address is supported; this is assumed - see comment in - nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */ - CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data, - sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP); - CUdeviceptr dp = (CUdeviceptr) dev->rev_data; + /* Get the on-device GOMP_REV_OFFLOAD_VAR variable. It should be + available but it might be not. One reason could be: if the user code + has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext + is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR + are not linked in. */ CUdeviceptr device_rev_offload_var; size_t device_rev_offload_size; CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, @@ -1459,11 +1459,23 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, &device_rev_offload_size, module, XSTRING (GOMP_REV_OFFLOAD_VAR)); if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuModuleGetGlobal error - GOMP_REV_OFFLOAD_VAR: %s", cuda_error (r)); - r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp, - sizeof (dp)); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); + { + free (*rev_fn_table); + *rev_fn_table = NULL; + } + else + { + /* cuMemHostAlloc memory is accessible on the device, if + unified-shared address is supported; this is assumed - see comment + in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */ + CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data, + sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP); + CUdeviceptr dp = (CUdeviceptr) dev->rev_data; + r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp, + sizeof (dp)); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); + } } nvptx_set_clocktick (module, dev); diff --git a/libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c b/libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c new file mode 100644 index 00000000000..33bd38481bb --- /dev/null +++ b/libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c @@ -0,0 +1,49 @@ +/* { dg-do run } */ +/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */ + +#pragma omp requires reverse_offload + +int +main () +{ + int A[10]; + int y; + + for (int i = 0; i < 10; i++) + A[i] = 2*i; + + y = 42; + + /* Pointlessly copy to the default device. */ + #pragma omp target data map(to: A) + { + /* Not enclosed in a target region (= i.e. running on the host); the + following is valid - it runs on the current device (= host). */ + #pragma omp target device ( ancestor:1 ) firstprivate(y) map(to: A) + { + if (y != 42) + __builtin_abort (); + for (int i = 0; i < 10; i++) + if (A[i] != 2*i) + __builtin_abort (); + for (int i = 0; i < 10; i++) + if (A[i] != 2*i) + A[i] = 4*i; + y = 31; + } + + if (y != 42) + __builtin_abort (); + for (int i = 0; i < 10; i++) + if (A[i] != 2*i) + __builtin_abort (); + } + + if (y != 42) + __builtin_abort (); + for (int i = 0; i < 10; i++) + if (A[i] != 2*i) + __builtin_abort (); + + return 0; +}