public inbox for gcc-patches@gcc.gnu.org
* [WIP] OpenMP 4 NVPTX support
@ 2015-04-21 15:58 Jakub Jelinek
  2015-04-22 15:08 ` Bernd Schmidt
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Jakub Jelinek @ 2015-04-21 15:58 UTC (permalink / raw)
  To: Julian Brown, Thomas Schwinge, Bernd Schmidt, Tobias Burnus, Ilya Verbin
  Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2578 bytes --]

Hi!

Attached is a minimal patch to get at least a trivial OpenMP 4.0 testcase
offloading to NVPTX (the first patch).  The second patch is WIP, just the
first few changes needed to make libgomp build for NVPTX (several weeks of
work at least remain).

The following seems to work, and the output suggests that it was offloaded
to a non-SHM arch (an illustrative run is sketched after the testcase):

int
main ()
{
  int v = 0;
  int *w = 0;
  int x = 0;
#pragma omp target
  {
    v = 6;
    w = &v;
    x = 1; // omp_is_initial_device ();
  }
  __builtin_printf ("%d %p %p %d\n", v, &v, w, x);
  return 0;
}
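
For illustration only (the addresses below are made up, not from an actual
run), the point is that the second and third printed values differ on a
non-SHM target, because w ends up holding the address of the device copy
of v rather than the host &v:

  6 0x7ffd7a9e1c34 0x2305ba0000 1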

But an only slightly more complicated testcase:

extern void *malloc (__SIZE_TYPE__);
extern void free (void *);

int
main ()
{
  int v = 0;
  int *w = 0;
  int x = 0;
#pragma omp target
  {
    v = 6;
    w = &v;
    char *p = malloc (64);
    x = 1; // omp_is_initial_device ();
    free (p);
  }
  __builtin_printf ("%d %p %p %d\n", v, &v, w, x);
  return 0;
}

suggests that, while it is nice that when building the nvptx accel compiler
we build libgcc.a, libc.a, libm.a, libgfortran.a (and in the future
hopefully libgomp.a), nothing attempts to link those in :(.

Is the plan to link those in at mkoffload time (I haven't seen mkoffload
attempt to invoke the nvptx-none-ld linker though), or to link them in
somehow at link_ptx time in the plugin?
In either case, it isn't clear to me how things will work (if at all) in
the case where multiple shared libraries (or an executable and at least one
shared library) have their own offloading bits and one tries to e.g. call
an offloaded function defined in the shared library from an offloaded
kernel in the executable: if any library needs some global singleton and
ends up linked in multiple times, there is no telling what the PTX JIT
will do.

Once that is resolved, another thing to figure out is how to efficiently
implement the TLS libgomp needs for its ICVs and other state - right now it
uses either __thread or pthread_getspecific, and neither of these is usable
here, of course.  I've been thinking about an array of those structures in
.shared memory indexed by %tid.x, but I guess that runs into the issue that
the array would need to be declared with a fixed size, and there is a very
small limit on the size of .shared memory.
So perhaps a file scope .shared pointer to global memory, where whoever
launches an OpenMP 4.0 kernel (either libgomp-plugin-nvptx.so.1 doing
GOMP_OFFLOAD_run, or later on dynamic parallelism from GOMP_target in the
nvptx libgomp.a) allocates the memory and some wrapper sets the .shared
variable to that allocated memory, then calls the kernel?
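
Something like the following minimal sketch (all of the names below are
made up rather than existing libgomp API, and it assumes a variable
attribute for placing the pointer in .shared memory):

extern void gomp_nvptx_kernel_body (void *);

struct gomp_nvptx_tls
{
  /* ICVs and other per-thread libgomp state would live here.  */
  int dummy;
};

/* File scope pointer in .shared memory; valid for the whole CTA once the
   wrapper below has set it.  */
static struct gomp_nvptx_tls *gomp_tls_array __attribute__((shared));

void
gomp_nvptx_kernel_entry (struct gomp_nvptx_tls *tls, void *args)
{
  /* tls points to an array in global memory with one element per thread,
     allocated by whoever launched the kernel (the plugin, or later on
     dynamic parallelism).  */
  gomp_tls_array = tls;
  gomp_nvptx_kernel_body (args);
}

Each thread would then find its state at &gomp_tls_array[tid], with tid
derived from %tid.x, instead of going through __thread or
pthread_getspecific.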

	Jakub

[-- Attachment #2: U1 --]
[-- Type: text/plain, Size: 3122 bytes --]

--- libgomp/plugin/plugin-nvptx.c.jj	2015-04-21 08:38:00.000000000 +0200
+++ libgomp/plugin/plugin-nvptx.c	2015-04-21 16:55:25.247470080 +0200
@@ -978,8 +978,8 @@ event_add (enum ptx_event_type type, CUe
 
 void
 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
-	  size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
-	  int vector_length, int async, void *targ_mem_desc)
+	    size_t *sizes, unsigned short *kinds, int num_gangs,
+	    int num_workers, int vector_length, int async, void *targ_mem_desc)
 {
   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
   CUfunction function;
@@ -1137,7 +1137,6 @@ nvptx_host2dev (void *d, const void *h,
   CUresult r;
   CUdeviceptr pb;
   size_t ps;
-  struct nvptx_thread *nvthd = nvptx_thread ();
 
   if (!s)
     return 0;
@@ -1162,7 +1161,8 @@ nvptx_host2dev (void *d, const void *h,
     GOMP_PLUGIN_fatal ("invalid size");
 
 #ifndef DISABLE_ASYNC
-  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
       CUevent *e;
 
@@ -1202,7 +1202,6 @@ nvptx_dev2host (void *h, const void *d,
   CUresult r;
   CUdeviceptr pb;
   size_t ps;
-  struct nvptx_thread *nvthd = nvptx_thread ();
 
   if (!s)
     return 0;
@@ -1227,7 +1226,8 @@ nvptx_dev2host (void *h, const void *d,
     GOMP_PLUGIN_fatal ("invalid size");
 
 #ifndef DISABLE_ASYNC
-  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
       CUevent *e;
 
@@ -1559,7 +1559,8 @@ GOMP_OFFLOAD_get_name (void)
 unsigned int
 GOMP_OFFLOAD_get_caps (void)
 {
-  return GOMP_OFFLOAD_CAP_OPENACC_200;
+  return GOMP_OFFLOAD_CAP_OPENACC_200
+	 | GOMP_OFFLOAD_CAP_OPENMP_400;
 }
 
 int
@@ -1759,7 +1760,7 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn
 			       void *targ_mem_desc)
 {
   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
-	    num_workers, vector_length, async, targ_mem_desc);
+	      num_workers, vector_length, async, targ_mem_desc);
 }
 
 void
@@ -1889,3 +1890,27 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (in
 {
   return nvptx_set_cuda_stream (async, stream);
 }
+
+void
+GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars)
+{
+  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
+  CUresult r;
+  struct ptx_device *ptx_dev = ptx_devices[ord];
+  const char *maybe_abort_msg = "(perhaps abort was called)";
+  void *args = &tgt_vars;
+
+  r = cuLaunchKernel (function,
+		      1, 1, 1,
+		      1, 1, 1,
+		      0, ptx_dev->null_stream->stream, &args, 0);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
+
+  r = cuCtxSynchronize ();
+  if (r == CUDA_ERROR_LAUNCH_FAILED)
+    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
+		       maybe_abort_msg);
+  else if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
+}

[-- Attachment #3: U2 --]
[-- Type: text/plain, Size: 2686 bytes --]

--- configure.jj	2015-04-21 08:38:24.000000000 +0200
+++ configure	2015-04-21 09:16:42.994959648 +0200
@@ -3171,6 +3171,9 @@ if test x$enable_libgomp = x ; then
 	;;
     *-*-darwin* | *-*-aix*)
 	;;
+    # And on NVPTX as an offloading target.
+    nvptx*-*-*)
+	;;
     *)
 	noconfigdirs="$noconfigdirs target-libgomp"
 	;;
--- libgomp/configure.jj	2015-04-21 11:08:08.347628799 +0200
+++ libgomp/configure	2015-04-21 11:07:39.000000000 +0200
@@ -15038,6 +15038,9 @@ case "$host" in
   *-*-rtems*)
     # RTEMS supports Pthreads, but the library is not available at GCC build time.
     ;;
+  nvptx*-*-*)
+    # NVPTX does not support Pthreads, has its own code replacement.
+    ;;
   *)
     # Check to see if -pthread or -lpthread is needed.  Prefer the former.
     # In case the pthread.h system header is not found, this test will fail.
--- libgomp/configure.tgt.jj	2015-04-21 08:38:00.000000000 +0200
+++ libgomp/configure.tgt	2015-04-21 10:59:30.857197475 +0200
@@ -151,6 +151,10 @@ case "${target}" in
 	XLDFLAGS="${XLDFLAGS} -lpthread"
 	;;
 
+  nvptx*-*-*)
+	config_path="nvptx"
+	;;
+
   *)
 	;;
 
--- libgomp/config.h.in.jj	2015-04-21 08:38:01.000000000 +0200
+++ libgomp/config.h.in	2015-04-21 08:38:01.000000000 +0200
@@ -39,6 +39,9 @@
 /* Define if pthread_{,attr_}{g,s}etaffinity_np is supported. */
 #undef HAVE_PTHREAD_AFFINITY_NP
 
+/* Define to 1 if you have the <pthread.h> header file. */
+#undef HAVE_PTHREAD_H
+
 /* Define to 1 if you have the <semaphore.h> header file. */
 #undef HAVE_SEMAPHORE_H
 
--- libgomp/libgomp.h.jj	2015-04-21 08:38:00.000000000 +0200
+++ libgomp/libgomp.h	2015-04-21 11:15:35.952217394 +0200
@@ -40,7 +40,9 @@
 #include "gstdint.h"
 #include "libgomp-plugin.h"
 
+#ifdef HAVE_PTHREAD_H
 #include <pthread.h>
+#endif
 #include <stdbool.h>
 #include <stdlib.h>
 #include <stdarg.h>
--- libgomp/configure.ac.jj	2015-04-21 08:38:00.000000000 +0200
+++ libgomp/configure.ac	2015-04-21 11:06:38.418117846 +0200
@@ -179,6 +179,9 @@ case "$host" in
   *-*-rtems*)
     # RTEMS supports Pthreads, but the library is not available at GCC build time.
     ;;
+  nvptx*-*-*)
+    # NVPTX does not support Pthreads, has its own code replacement.
+    ;;
   *)
     # Check to see if -pthread or -lpthread is needed.  Prefer the former.
     # In case the pthread.h system header is not found, this test will fail.
--- configure.ac.jj	2015-04-21 08:38:09.000000000 +0200
+++ configure.ac	2015-04-21 09:14:50.107827544 +0200
@@ -539,6 +539,9 @@ if test x$enable_libgomp = x ; then
 	;;
     *-*-darwin* | *-*-aix*)
 	;;
+    # And on NVPTX as an offloading target.
+    nvptx*-*-*)
+	;;
     *)
 	noconfigdirs="$noconfigdirs target-libgomp"
 	;;



Thread overview: 11+ messages
2015-04-21 15:58 [WIP] OpenMP 4 NVPTX support Jakub Jelinek
2015-04-22 15:08 ` Bernd Schmidt
2015-05-13 20:19   ` [gomp4] nvptx offloading linking (was: [WIP] OpenMP 4 NVPTX support) Thomas Schwinge
2015-10-02 19:46     ` nvptx offloading linking Thomas Schwinge
2015-10-05 16:08     ` [gomp4] [nvptx] Don't explicitly pass "-lgomp" to the offload compiler (was: nvptx offloading linking) Thomas Schwinge
2015-04-23 14:40 ` [WIP] OpenMP 4 NVPTX support Jakub Jelinek
2015-07-22 16:13 ` Thomas Schwinge
2015-07-22 16:47   ` Jakub Jelinek
2015-08-10 15:56     ` Empty libgomp for nvptx (was: [WIP] OpenMP 4 NVPTX support) Thomas Schwinge
2015-08-10 16:29       ` Empty libgomp for nvptx Thomas Schwinge
2015-08-10 16:31   ` [gomp4] [nvptx] Move GOMP stuff from libgcc to libgomp (was: [WIP] OpenMP 4 NVPTX support) Thomas Schwinge
