libgomp: cuda.h and omp_target_memcpy_rect cleanup Fixes for commit r14-2792-g25072a477a56a727b369bf9b20f4d18198ff5894 "OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect", namely: In that commit, the code was changed to handle shared-memory devices; however, as pointed out, omp_target_memcpy_check already sets the pointer to NULL in that case. Hence, this commit reverts to the prior version. In cuda.h, it adds cuMemcpyPeer{,Async} for symmetry with cuMemcpy3DPeer (all currently unused) and, in three structs, fixes reserved-member names and removes a bogus 'const'. And it changes a DLSYM to DLSYM_OPT as not all plugins support the new functions yet. include/ChangeLog: * cuda/cuda.h (CUDA_MEMCPY2D, CUDA_MEMCPY3D, CUDA_MEMCPY3D_PEER): Remove bogus 'const' from 'const void *dst' and fix reserved-member names in those structs. (cuMemcpyPeer, cuMemcpyPeerAsync): Add. libgomp/ChangeLog: * target.c (omp_target_memcpy_rect_worker): Undo dim=1 change for GOMP_OFFLOAD_CAP_SHARED_MEM. (omp_target_memcpy_rect_copy): Likewise for lock condition. (gomp_load_plugin_for_device): Use DLSYM_OPT not DLSYM for memcpy3d/memcpy2d. * plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): Use memset 0 to nullify reserved and unused src/dst fields for that mem type; remove '{src,dst}LOD = 0'. 
Co-authored-by: Thomas Schwinge include/cuda/cuda.h | 12 +++++----- libgomp/plugin/plugin-nvptx.c | 6 +++-- libgomp/target.c | 52 ++++++++++++++----------------------------- 3 files changed, 28 insertions(+), 42 deletions(-) diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h index 09c3c2b8dbe..94fc64a488d 100644 --- a/include/cuda/cuda.h +++ b/include/cuda/cuda.h @@ -147,7 +147,7 @@ typedef struct { size_t dstXInBytes, dstY; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; size_t dstPitch; @@ -162,16 +162,16 @@ typedef struct { const void *srcHost; CUdeviceptr srcDevice; CUarray srcArray; - void *dummy; + void *reserved0; size_t srcPitch, srcHeight; size_t dstXInBytes, dstY, dstZ; size_t dstLOD; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; - void *dummy2; + void *reserved1; size_t dstPitch, dstHeight; size_t WidthInBytes, Height, Depth; @@ -190,7 +190,7 @@ typedef struct { size_t dstXInBytes, dstY, dstZ; size_t dstLOD; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; CUcontext dstContext; @@ -246,6 +246,8 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t); CUresult cuMemAllocHost (void **, size_t); CUresult cuMemHostAlloc (void **, size_t, unsigned int); CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t); +CUresult cuMemcpyPeer (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t); +CUresult cuMemcpyPeerAsync (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream); #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2 CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream); #define cuMemcpyDtoH cuMemcpyDtoH_v2 diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 9cdc55cac6b..00d4241ae02 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1794,6 +1794,8 @@ GOMP_OFFLOAD_memcpy2d (int dst_ord, int 
src_ord, size_t dim1_size, /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */ CUDA_MEMCPY2D data; + + memset (&data, 0, sizeof (data)); data.WidthInBytes = dim1_size; data.Height = dim0_len; @@ -1855,6 +1857,8 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */ CUDA_MEMCPY3D data; + + memset (&data, 0, sizeof (data)); data.WidthInBytes = dim2_size; data.Height = dim1_len; data.Depth = dim0_len; @@ -1874,7 +1878,6 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, data.dstXInBytes = dst_offset2_size; data.dstY = dst_offset1_len; data.dstZ = dst_offset0_len; - data.dstLOD = 0; if (src_ord == -1) { @@ -1891,7 +1894,6 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, data.srcXInBytes = src_offset2_size; data.srcY = src_offset1_len; data.srcZ = src_offset0_len; - data.srcLOD = 0; CUDA_CALL (cuMemcpy3D, &data); return true; diff --git a/libgomp/target.c b/libgomp/target.c index 5cf2e8dce37..cd4cc1b01ca 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -4540,33 +4540,22 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off) || __builtin_mul_overflow (element_size, src_offsets[0], &src_off)) return EINVAL; - if (src_devicep != NULL && src_devicep == dst_devicep) - ret = src_devicep->dev2dev_func (src_devicep->target_id, - (char *) dst + dst_off, - (const char *) src + src_off, - length); - else if (src_devicep != NULL - && (dst_devicep == NULL - || (dst_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM))) - ret = src_devicep->dev2host_func (src_devicep->target_id, + if (dst_devicep == NULL && src_devicep == NULL) + { + memcpy ((char *) dst + dst_off, (const char *) src + src_off, + length); + ret = 1; + } + else if (src_devicep == NULL) + ret = dst_devicep->host2dev_func (dst_devicep->target_id, (char *) dst + dst_off, (const char *) src + 
src_off, length); - else if (dst_devicep != NULL - && (src_devicep == NULL - || (src_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM))) - ret = dst_devicep->host2dev_func (dst_devicep->target_id, + else if (dst_devicep == NULL) + ret = src_devicep->dev2host_func (src_devicep->target_id, (char *) dst + dst_off, (const char *) src + src_off, length); - else if (dst_devicep == NULL && src_devicep == NULL) - { - memcpy ((char *) dst + dst_off, (const char *) src + src_off, - length); - ret = 1; - } else if (src_devicep == dst_devicep) ret = src_devicep->dev2dev_func (src_devicep->target_id, (char *) dst + dst_off, @@ -4584,7 +4573,8 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, else if (*tmp_size < length) { *tmp_size = length; - *tmp = realloc (*tmp, length); + free (*tmp); + *tmp = malloc (length); if (*tmp == NULL) return ENOMEM; } @@ -4599,7 +4589,7 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, return ret ? 0 : EINVAL; } - /* host->device, device->host and same-device device->device. */ + /* host->device, device->host and intra device. 
*/ if (num_dims == 2 && ((src_devicep && src_devicep == dst_devicep @@ -4711,16 +4701,8 @@ omp_target_memcpy_rect_copy (void *dst, const void *src, bool lock_src; bool lock_dst; - lock_src = (src_devicep - && (!dst_devicep - || src_devicep == dst_devicep - || !(src_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM))); - lock_dst = (dst_devicep - && (!lock_src - || (src_devicep != dst_devicep - && !(dst_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM)))); + lock_src = src_devicep != NULL; + lock_dst = dst_devicep != NULL && src_devicep != dst_devicep; if (lock_src) gomp_mutex_lock (&src_devicep->lock); if (lock_dst) @@ -5076,8 +5058,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, DLSYM (free); DLSYM (dev2host); DLSYM (host2dev); - DLSYM (memcpy2d); - DLSYM (memcpy3d); + DLSYM_OPT (memcpy2d, memcpy2d); + DLSYM_OPT (memcpy3d, memcpy3d); device->capabilities = device->get_caps_func (); if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) {