diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index c04c3acd679..7fc7f4a5bbf 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -391,7 +391,7 @@ nvptx_attach_host_thread_to_device (int n) CUresult r; struct ptx_device *ptx_dev; CUcontext thd_ctx; - +__builtin_fprintf (stderr, "DEBUG: nvptx_attach_host_thread_to_device - %d\n", n); r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev); if (r == CUDA_ERROR_NOT_PERMITTED) { @@ -400,6 +400,7 @@ nvptx_attach_host_thread_to_device (int n) } if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) { +__builtin_fprintf (stderr, "DEBUG: ERROR nvptx_attach_host_thread_to_device - %d\n", n); GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r)); return false; } @@ -445,9 +446,11 @@ nvptx_open_device (int n) ptx_dev->dev = dev; ptx_dev->ctx_shared = false; +__builtin_fprintf (stderr, "DEBUG: nvptx_open_device - %d\n", n); r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev); if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) { +__builtin_fprintf (stderr, "DEBUG: ERROR nvptx_open_device - %d\n", n); GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r)); return NULL; } @@ -1174,24 +1177,28 @@ nvptx_get_current_cuda_context (void) const char * GOMP_OFFLOAD_get_name (void) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_get_name\n"); return "nvptx"; } unsigned int GOMP_OFFLOAD_get_caps (void) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_get_caps\n"); return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400; } int GOMP_OFFLOAD_get_type (void) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_get_type\n"); return OFFLOAD_TARGET_TYPE_NVIDIA_PTX; } int GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_get_num_devices %u\n", omp_requires_mask); int num_devices = nvptx_get_num_devices (); /* Return -1 if no omp_requires_mask cannot be fulfilled but devices were present. 
Unified-shared address: see comment in @@ -1207,6 +1214,7 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask) bool GOMP_OFFLOAD_init_device (int n) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_init_device %d\n", n); struct ptx_device *dev; pthread_mutex_lock (&ptx_dev_lock); @@ -1248,6 +1256,7 @@ GOMP_OFFLOAD_init_device (int n) bool GOMP_OFFLOAD_fini_device (int n) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_fini_device %d\n", n); pthread_mutex_lock (&ptx_dev_lock); if (ptx_devices[n] != NULL) @@ -1278,6 +1287,7 @@ GOMP_OFFLOAD_fini_device (int n) unsigned GOMP_OFFLOAD_version (void) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_version\n"); return GOMP_VERSION; } @@ -1311,6 +1321,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, uint64_t **rev_fn_table, uint64_t *host_ind_fn_table) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_load_image, %d, %u\n", ord, version); CUmodule module; const char *const *var_names; const struct targ_fn_launch *fn_descs; @@ -1538,6 +1549,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, bool GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_unload_image, %d, %u\n", ord, version); struct ptx_image_data *image, **prev_p; struct ptx_device *dev = ptx_devices[ord]; @@ -1568,6 +1580,7 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data) void * GOMP_OFFLOAD_alloc (int ord, size_t size) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_alloc, %d, %lu\n", ord, (long unsigned)size); if (!nvptx_attach_host_thread_to_device (ord)) return NULL; @@ -1604,6 +1617,7 @@ GOMP_OFFLOAD_alloc (int ord, size_t size) bool GOMP_OFFLOAD_free (int ord, void *ptr) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_free, %d\n", ord); return (nvptx_attach_host_thread_to_device (ord) && nvptx_free (ptr, ptx_devices[ord])); } @@ -1615,6 +1629,7 @@ 
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), void **devaddrs, unsigned *dims, void *targ_mem_desc) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_exec\n"); GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__); CUdeviceptr dp = (CUdeviceptr) devaddrs; @@ -1637,6 +1652,7 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), unsigned *dims, void *targ_mem_desc, struct goacc_asyncqueue *aq) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_async_exec\n"); GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__); CUdeviceptr dp = (CUdeviceptr) devaddrs; @@ -1646,6 +1662,7 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), void * GOMP_OFFLOAD_openacc_create_thread_data (int ord) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_create_thread_data\n"); struct ptx_device *ptx_dev; struct nvptx_thread *nvthd = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread)); @@ -1670,18 +1687,21 @@ GOMP_OFFLOAD_openacc_create_thread_data (int ord) void GOMP_OFFLOAD_openacc_destroy_thread_data (void *data) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_destroy_thread_data\n"); free (data); } void * GOMP_OFFLOAD_openacc_cuda_get_current_device (void) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_cuda_get_current_device\n"); return nvptx_get_current_cuda_device (); } void * GOMP_OFFLOAD_openacc_cuda_get_current_context (void) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_cuda_get_current_context\n"); return nvptx_get_current_cuda_context (); } @@ -1689,6 +1709,7 @@ GOMP_OFFLOAD_openacc_cuda_get_current_context (void) void * GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_cuda_get_stream\n"); return (void *) aq->cuda_stream; } @@ -1696,6 +1717,7 @@ GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq) int GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream) { +__builtin_fprintf (stderr, "DEBUG 
GOMP_OFFLOAD_openacc_cuda_set_stream\n"); if (aq->cuda_stream) { CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream); @@ -1721,6 +1743,7 @@ nvptx_goacc_asyncqueue_construct (unsigned int flags) struct goacc_asyncqueue * GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused))) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_async_construct\n"); return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT); } @@ -1735,12 +1758,14 @@ nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq) bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_async_destruct\n"); return nvptx_goacc_asyncqueue_destruct (aq); } int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_async_test\n"); CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream); if (r == CUDA_SUCCESS) return 1; @@ -1761,6 +1786,7 @@ nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq) bool GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_async_synchronize\n"); return nvptx_goacc_asyncqueue_synchronize (aq); } @@ -1768,6 +1794,7 @@ bool GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1, struct goacc_asyncqueue *aq2) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_async_serialize\n"); CUevent e; CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING); CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream); @@ -1790,6 +1817,7 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq, void (*callback_fn)(void *), void *userptr) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_async_queue_callback\n"); struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b)); b->fn = callback_fn; b->ptr = userptr; @@ -1837,6 +1865,7 @@ cuda_memcpy_sanity_check (const void *h, const void *d, 
size_t s) bool GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_host2dev\n"); if (!nvptx_attach_host_thread_to_device (ord) || !cuda_memcpy_sanity_check (src, dst, n)) return false; @@ -1847,6 +1876,7 @@ GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) bool GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_dev2host\n"); if (!nvptx_attach_host_thread_to_device (ord) || !cuda_memcpy_sanity_check (dst, src, n)) return false; @@ -1857,6 +1887,7 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) bool GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_dev2dev\n"); CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL); return true; } @@ -1868,6 +1899,7 @@ GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size, const void *src, size_t src_offset1_size, size_t src_offset0_len, size_t src_dim1_size) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_memcpy2d\n"); if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord)) return false; @@ -1960,6 +1992,7 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, size_t src_offset0_len, size_t src_dim2_size, size_t src_dim1_len) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_memcpy3d\n"); if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? 
src_ord : dst_ord)) return false; @@ -2050,6 +2083,7 @@ bool GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src, size_t n, struct goacc_asyncqueue *aq) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_async_host2dev\n"); if (!nvptx_attach_host_thread_to_device (ord) || !cuda_memcpy_sanity_check (src, dst, n)) return false; @@ -2061,6 +2095,7 @@ bool GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src, size_t n, struct goacc_asyncqueue *aq) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_async_dev2host\n"); if (!nvptx_attach_host_thread_to_device (ord) || !cuda_memcpy_sanity_check (dst, src, n)) return false; @@ -2071,6 +2106,7 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src, union goacc_property_value GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_openacc_get_property\n"); union goacc_property_value propval = { .val = 0 }; pthread_mutex_lock (&ptx_dev_lock); @@ -2211,6 +2247,7 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num) void GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args) { +__builtin_fprintf (stderr, "DEBUG GOMP_OFFLOAD_run\n"); struct targ_fn_descriptor *tgt_fn_desc = (struct targ_fn_descriptor *) tgt_fn; CUfunction function = tgt_fn_desc->fn; diff --git a/libgomp/target.c b/libgomp/target.c index 1367e9cce6c..f758b20ba4c 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -2524,6 +2524,8 @@ gomp_unload_image_from_device (struct gomp_device_descr *devicep, node = splay_tree_lookup (&devicep->mem_map, &k); } +__builtin_fprintf(stderr, "DEBUG: gomp_unload_image_from_device\n"); + if (!devicep->unload_image_func (devicep->target_id, version, target_data)) { gomp_mutex_unlock (&devicep->lock); @@ -2698,12 +2700,14 @@ GOMP_offload_unregister_ver (unsigned version, const void *host_table, target_data = data; gomp_mutex_lock (&register_lock); 
+__builtin_fprintf(stderr, "DEBUG: GOMP_offload_unregister_ver\n"); /* Unload image from all initialized devices. */ for (i = 0; i < num_devices; i++) { struct gomp_device_descr *devicep = &devices[i]; gomp_mutex_lock (&devicep->lock); +__builtin_fprintf(stderr, "DEBUG: GOMP_offload_unregister_ver dev=%d; state=%d\n", i, devicep->state); if (devicep->type == target_type && devicep->state == GOMP_DEVICE_INITIALIZED) gomp_unload_image_from_device (devicep, version, @@ -2775,6 +2779,7 @@ gomp_fini_device (struct gomp_device_descr *devicep) attribute_hidden void gomp_unload_device (struct gomp_device_descr *devicep) { +__builtin_fprintf(stderr, "DEBUG: gomp_unload_device; state=%d\n", devicep->state); if (devicep->state == GOMP_DEVICE_INITIALIZED) { unsigned i; @@ -5217,6 +5222,7 @@ gomp_target_fini (void) bool ret = true; struct gomp_device_descr *devicep = &devices[i]; gomp_mutex_lock (&devicep->lock); +__builtin_fprintf(stderr, "DEBUG: gomp_target_fini; dev=%d, state=%d\n", i, devicep->state); if (devicep->state == GOMP_DEVICE_INITIALIZED) ret = gomp_fini_device (devicep); gomp_mutex_unlock (&devicep->lock);