public inbox for gcc-patches@gcc.gnu.org
* [patch] adjust default nvptx launch geometry for OpenACC offloaded regions
@ 2018-06-20 21:59 Cesar Philippidis
  2018-06-20 22:16 ` Tom de Vries
  2018-06-29 17:16 ` Cesar Philippidis
  0 siblings, 2 replies; 14+ messages in thread
From: Cesar Philippidis @ 2018-06-20 21:59 UTC (permalink / raw)
  To: gcc-patches, Jakub Jelinek; +Cc: tdevries

[-- Attachment #1: Type: text/plain, Size: 1221 bytes --]

At present, the nvptx libgomp plugin does not take into account the
amount of shared resources on GPUs (mostly shared-memory and register
usage) when selecting the default num_gangs and num_workers. In certain
situations, an OpenACC offloaded function can fail to launch if the GPU
does not have sufficient shared resources to accommodate all of the
threads in a CUDA block. This typically manifests when a PTX function
uses a lot of registers and num_workers is set too large, although it
can also happen if shared memory has been exhausted by the threads
in a vector.
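
As a rough, hypothetical illustration of the failure mode: on a GPU
that allows at most 64K registers per block, a PTX function needing
128 registers per thread can only keep about 512 threads (16 warps)
per block. Launching it with the old fixed default of num_workers = 32
(32 warps of 32 threads) requests 1024 threads per block, more than
the register budget allows, and the launch fails with
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES.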

This patch resolves that issue by adjusting num_workers based on the
amount of shared resources used by each thread. If worker parallelism
has been requested, libgomp will spawn as many workers as possible, up
to 32 (a simplified sketch of the calculation follows below). Without
this patch, libgomp would always default to launching 32 workers when
worker parallelism is used.
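
To illustrate, here is a minimal stand-alone sketch of the
occupancy-style calculation that the patch performs in nvptx_exec.
The hard-coded numbers below are hypothetical; the plugin probes the
real values via cuDeviceGetAttribute and cuFuncGetAttribute:

#include <stdio.h>

int
main (void)
{
  /* Hypothetical device and function properties.  */
  int warp_size = 32;
  int regs_per_sm = 65536;           /* register file size per SM  */
  int max_threads_per_block = 1024;
  int reg_unit_size = 256;           /* register allocation unit size  */
  int reg_granularity = 4;           /* register allocation granularity  */
  int regs_per_thread = 40;          /* registers used by the PTX function  */

  /* Registers are allocated per warp, rounded up to the allocation
     unit size.  */
  int regs_per_warp = ((regs_per_thread * warp_size + reg_unit_size - 1)
		       / reg_unit_size) * reg_unit_size;
  /* The number of resident warps is rounded down to a multiple of the
     allocation granularity.  */
  int threads_per_sm = (regs_per_sm / regs_per_warp / reg_granularity)
    * reg_granularity * warp_size;
  int threads_per_block = threads_per_sm > max_threads_per_block
    ? max_threads_per_block : threads_per_sm;

  /* num_workers defaults to the number of warps that fit in a block.  */
  printf ("num_workers = %d\n", threads_per_block / warp_size);
  return 0;
}

With 40 registers per thread this still yields 32 workers; with 128
registers per thread it drops to 16, which is the situation that used
to fail under the fixed default of 32.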

Besides the worker parallelism, this patch also includes some
heuristics for selecting num_gangs. Before, the plugin would launch two
gangs per GPU multiprocessor. Now it follows the formula contained in
the "CUDA Occupancy Calculator" spreadsheet that's distributed with CUDA.

Is this patch OK for trunk?

Thanks,
Cesar

[-- Attachment #2: trunk-default-par.diff --]
[-- Type: text/x-patch, Size: 15455 bytes --]

2018-06-20  Cesar Philippidis  <cesar@codesourcery.com>

        gcc/
        * config/nvptx/nvptx.c (PTX_GANG_DEFAULT): Delete define.
        (PTX_DEFAULT_RUNTIME_DIM): New define.
        (nvptx_goacc_validate_dims): Use it to allow the runtime to
        dynamically allocate num_workers and num_gangs.
        (nvptx_dim_limit): Don't impose an arbitrary num_workers limit.

        libgomp/
        * plugin/plugin-nvptx.c (struct ptx_device): Add
        max_threads_per_block, warp_size, max_threads_per_multiprocessor,
        max_shared_memory_per_multiprocessor, binary_version,
        register_allocation_unit_size, register_allocation_granularity,
        compute_capability_major, compute_capability_minor members.
        (nvptx_open_device): Probe driver for those values.  Adjust
        regs_per_sm and max_shared_memory_per_multiprocessor for K80
        hardware. Dynamically allocate default num_workers.
        (nvptx_exec): Don't probe the CUDA runtime for the hardware
        info.  Use the new variables inside targ_fn_descriptor and
        ptx_device instead.  Adjust the default num_gangs and
        num_workers.  Add a diagnostic when the hardware cannot
        support the requested num_workers.
        (GOMP_OFFLOAD_load_image): Set binary_version and
        register_allocation_{unit_size,granularity}.
        * plugin/cuda/cuda.h (CUdevice_attribute): Add
        CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
        CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR.


diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 5608bee..c1946e7 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -5165,7 +5165,7 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
 /* Define dimension sizes for known hardware.  */
 #define PTX_VECTOR_LENGTH 32
 #define PTX_WORKER_LENGTH 32
-#define PTX_GANG_DEFAULT  0 /* Defer to runtime.  */
+#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
 
 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
 
@@ -5214,9 +5214,9 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
     {
       dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
       if (dims[GOMP_DIM_WORKER] < 0)
-	dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
+	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
       if (dims[GOMP_DIM_GANG] < 0)
-	dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
+	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
       changed = true;
     }
 
@@ -5230,9 +5230,6 @@ nvptx_dim_limit (int axis)
 {
   switch (axis)
     {
-    case GOMP_DIM_WORKER:
-      return PTX_WORKER_LENGTH;
-
     case GOMP_DIM_VECTOR:
       return PTX_VECTOR_LENGTH;
 
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825..c7d50db 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -69,6 +69,8 @@ typedef enum {
   CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
   CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
   CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
 } CUdevice_attribute;
 
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 89326e5..ada1df2 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -409,11 +409,25 @@ struct ptx_device
   bool map;
   bool concur;
   bool mkern;
-  int  mode;
+  int mode;
+  int compute_capability_major;
+  int compute_capability_minor;
   int clock_khz;
   int num_sms;
   int regs_per_block;
   int regs_per_sm;
+  int max_threads_per_block;
+  int warp_size;
+  int max_threads_per_multiprocessor;
+  int max_shared_memory_per_multiprocessor;
+
+  int binary_version;
+
+  /* register_allocation_unit_size and register_allocation_granularity
+     were extracted from the "Register Allocation Granularity" data in
+     Nvidia's CUDA Occupancy Calculator spreadsheet.  */
+  int register_allocation_unit_size;
+  int register_allocation_granularity;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -725,6 +739,9 @@ nvptx_open_device (int n)
   ptx_dev->ord = n;
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
+  ptx_dev->binary_version = 0;
+  ptx_dev->register_allocation_unit_size = 0;
+  ptx_dev->register_allocation_granularity = 0;
 
   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
@@ -765,6 +782,14 @@ nvptx_open_device (int n)
   ptx_dev->mode = pi;
 
   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+  ptx_dev->compute_capability_major = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+  ptx_dev->compute_capability_minor = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
   ptx_dev->mkern = pi;
 
@@ -794,13 +819,28 @@ nvptx_open_device (int n)
   ptx_dev->regs_per_sm = pi;
 
   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+  ptx_dev->max_threads_per_block = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
+  ptx_dev->warp_size = pi;
   if (pi != 32)
     {
       GOMP_PLUGIN_error ("Only warp size 32 is supported");
       return NULL;
     }
 
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
+  ptx_dev->max_threads_per_multiprocessor = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+		  &pi,
+		  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
+		  dev);
+  ptx_dev->max_shared_memory_per_multiprocessor = pi;
+
   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
 			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
   if (r != CUDA_SUCCESS)
@@ -809,6 +849,39 @@ nvptx_open_device (int n)
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 
+  GOMP_PLUGIN_debug (0, "Nvidia device %d:\n\tGPU_OVERLAP = %d\n"
+		     "\tCAN_MAP_HOST_MEMORY = %d\n\tCONCURRENT_KERNELS = %d\n"
+		     "\tCOMPUTE_MODE = %d\n\tINTEGRATED = %d\n"
+		     "\tCU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = %d\n"
+		     "\tCU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = %d\n"
+		     "\tINTEGRATED = %d\n"
+		     "\tMAX_THREADS_PER_BLOCK = %d\n\tWARP_SIZE = %d\n"
+		     "\tMULTIPROCESSOR_COUNT = %d\n"
+		     "\tMAX_THREADS_PER_MULTIPROCESSOR = %d\n"
+		     "\tMAX_REGISTERS_PER_MULTIPROCESSOR = %d\n"
+		     "\tMAX_SHARED_MEMORY_PER_MULTIPROCESSOR = %d\n",
+		     ptx_dev->ord, ptx_dev->overlap, ptx_dev->map,
+		     ptx_dev->concur, ptx_dev->mode, ptx_dev->mkern,
+		     ptx_dev->compute_capability_major,
+		     ptx_dev->compute_capability_minor,
+		     ptx_dev->mkern, ptx_dev->max_threads_per_block,
+		     ptx_dev->warp_size, ptx_dev->num_sms,
+		     ptx_dev->max_threads_per_multiprocessor,
+		     ptx_dev->regs_per_sm,
+		     ptx_dev->max_shared_memory_per_multiprocessor);
+
+  /* K80 (SM_37) boards contain two physical GPUs.  Consequently they
+     report 2x larger values for MAX_REGISTERS_PER_MULTIPROCESSOR and
+     MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.  Those values need to be
+     adjusted in order to allow nvptx_exec to select an
+     appropriate num_workers.  */
+  if (ptx_dev->compute_capability_major == 3
+      && ptx_dev->compute_capability_minor == 7)
+    {
+      ptx_dev->regs_per_sm /= 2;
+      ptx_dev->max_shared_memory_per_multiprocessor /= 2;
+    }
+
   if (!init_streams_for_device (ptx_dev, async_engines))
     return NULL;
 
@@ -1120,6 +1193,14 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   void *hp, *dp;
   struct nvptx_thread *nvthd = nvptx_thread ();
   const char *maybe_abort_msg = "(perhaps abort was called)";
+  int cpu_size = nvptx_thread ()->ptx_dev->max_threads_per_multiprocessor;
+  int block_size = nvptx_thread ()->ptx_dev->max_threads_per_block;
+  int dev_size = nvptx_thread ()->ptx_dev->num_sms;
+  int warp_size = nvptx_thread ()->ptx_dev->warp_size;
+  int rf_size = nvptx_thread ()->ptx_dev->regs_per_sm;
+  int reg_unit_size = nvptx_thread ()->ptx_dev->register_allocation_unit_size;
+  int reg_granularity
+    = nvptx_thread ()->ptx_dev->register_allocation_granularity;
 
   function = targ_fn->fn;
 
@@ -1138,71 +1219,92 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
        seen_zero = 1;
     }
 
-  if (seen_zero)
-    {
-      /* See if the user provided GOMP_OPENACC_DIM environment
-	 variable to specify runtime defaults. */
-      static int default_dims[GOMP_DIM_MAX];
+  /* Calculate the optimal number of gangs for the current device.  */
+  int reg_used = targ_fn->regs_per_thread;
+  int reg_per_warp = ((reg_used * warp_size + reg_unit_size - 1)
+		      / reg_unit_size) * reg_unit_size;
+  int threads_per_sm = (rf_size / reg_per_warp / reg_granularity)
+    * reg_granularity * warp_size;
+  int threads_per_block = threads_per_sm > block_size
+    ? block_size : threads_per_sm;
 
-      pthread_mutex_lock (&ptx_dev_lock);
-      if (!default_dims[0])
-	{
-	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
-	    default_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
-
-	  int warp_size, block_size, dev_size, cpu_size;
-	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
-	  /* 32 is the default for known hardware.  */
-	  int gang = 0, worker = 32, vector = 32;
-	  CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
-
-	  cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
-	  cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
-	  cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
-	  cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
-
-	  if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
-				 dev) == CUDA_SUCCESS
-	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
-				    dev) == CUDA_SUCCESS
-	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
-				    dev) == CUDA_SUCCESS
-	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
-				    dev) == CUDA_SUCCESS)
-	    {
-	      GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
-				 " dev_size=%d, cpu_size=%d\n",
-				 warp_size, block_size, dev_size, cpu_size);
-	      gang = (cpu_size / block_size) * dev_size;
-	      worker = block_size / warp_size;
-	      vector = warp_size;
-	    }
+  threads_per_block /= warp_size;
 
-	  /* There is no upper bound on the gang size.  The best size
-	     matches the hardware configuration.  Logical gangs are
-	     scheduled onto physical hardware.  To maximize usage, we
-	     should guess a large number.  */
-	  if (default_dims[GOMP_DIM_GANG] < 1)
-	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
-	  /* The worker size must not exceed the hardware.  */
-	  if (default_dims[GOMP_DIM_WORKER] < 1
-	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
-	    default_dims[GOMP_DIM_WORKER] = worker;
-	  /* The vector size must exactly match the hardware.  */
-	  if (default_dims[GOMP_DIM_VECTOR] < 1
-	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
-	    default_dims[GOMP_DIM_VECTOR] = vector;
-
-	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
-			     default_dims[GOMP_DIM_GANG],
-			     default_dims[GOMP_DIM_WORKER],
-			     default_dims[GOMP_DIM_VECTOR]);
-	}
-      pthread_mutex_unlock (&ptx_dev_lock);
+  if (threads_per_sm > cpu_size)
+    threads_per_sm = cpu_size;
 
+  /* Set default launch geometry.  */
+  static int default_dims[GOMP_DIM_MAX];
+  pthread_mutex_lock (&ptx_dev_lock);
+  if (!default_dims[0])
+    {
+      /* 32 is the default for known hardware.  */
+      int gang = 0, worker = 32, vector = 32;
+
+      gang = (cpu_size / block_size) * dev_size;
+      vector = warp_size;
+
+      /* If the user hasn't specified the number of gangs, determine
+	 it dynamically based on the hardware configuration.  */
+      if (default_dims[GOMP_DIM_GANG] == 0)
+	default_dims[GOMP_DIM_GANG] = -1;
+      /* The worker size must not exceed the hardware.  */
+      if (default_dims[GOMP_DIM_WORKER] < 1
+	  || (default_dims[GOMP_DIM_WORKER] > worker && gang))
+	default_dims[GOMP_DIM_WORKER] = -1;
+      /* The vector size must exactly match the hardware.  */
+      if (default_dims[GOMP_DIM_VECTOR] < 1
+	  || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
+	default_dims[GOMP_DIM_VECTOR] = vector;
+
+      GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
+			 default_dims[GOMP_DIM_GANG],
+			 default_dims[GOMP_DIM_WORKER],
+			 default_dims[GOMP_DIM_VECTOR]);
+    }
+  pthread_mutex_unlock (&ptx_dev_lock);
+
+  if (seen_zero)
+    {
       for (i = 0; i != GOMP_DIM_MAX; i++)
-	if (!dims[i])
-	  dims[i] = default_dims[i];
+	if (!dims[i])
+	  {
+	    if (default_dims[i] > 0)
+	      dims[i] = default_dims[i];
+	    else
+	      switch (i) {
+	      case GOMP_DIM_GANG:
+		/* The constant 2 was determined empirically.  The
+		   justification behind it is to prevent the hardware
+		   from idling by providing it with twice the amount of
+		   work that it can physically handle.  */
+		dims[i] = (reg_granularity > 0)
+		  ? 2 * threads_per_sm / warp_size * dev_size
+		  : 2 * dev_size;
+		break;
+	      case GOMP_DIM_WORKER:
+		dims[i] = threads_per_block;
+		break;
+	      case GOMP_DIM_VECTOR:
+		dims[i] = warp_size;
+		break;
+	      default:
+		abort ();
+	      }
+	  }
+    }
+
+  /* Check if the accelerator has sufficient hardware resources to
+     launch the offloaded kernel.  */
+  if (dims[GOMP_DIM_WORKER] > 1)
+    {
+      if (reg_granularity > 0 && dims[GOMP_DIM_WORKER] > threads_per_block)
+	GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources "
+			   "to launch '%s'; recompile the program with "
+			   "'num_workers = %d' on that offloaded region or "
+			   "'-fopenacc-dim=-:%d'.\n",
+			   targ_fn->launch->fn, threads_per_block,
+			   threads_per_block);
     }
 
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
@@ -1870,6 +1972,39 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
       targ_fns->regs_per_thread = nregs;
       targ_fns->max_threads_per_block = mthrs;
 
+      if (!dev->binary_version)
+	{
+	  int val;
+	  CUDA_CALL_ERET (-1, cuFuncGetAttribute, &val,
+			  CU_FUNC_ATTRIBUTE_BINARY_VERSION, function);
+	  dev->binary_version = val;
+
+	  /* These values were obtained from the CUDA Occupancy Calculator
+	     spreadsheet.  */
+	  if (dev->binary_version == 20
+	      || dev->binary_version == 21)
+	    {
+	      dev->register_allocation_unit_size = 128;
+	      dev->register_allocation_granularity = 2;
+	    }
+	  else if (dev->binary_version == 60)
+	    {
+	      dev->register_allocation_unit_size = 256;
+	      dev->register_allocation_granularity = 2;
+	    }
+	  else if (dev->binary_version <= 70)
+	    {
+	      dev->register_allocation_unit_size = 256;
+	      dev->register_allocation_granularity = 4;
+	    }
+	  else
+	    {
+	      /* Fall back to -1 for unknown targets.  */
+	      dev->register_allocation_unit_size = -1;
+	      dev->register_allocation_granularity = -1;
+	    }
+	}
+
       targ_tbl->start = (uintptr_t) targ_fns;
       targ_tbl->end = targ_tbl->start + 1;
     }


Thread overview: 14+ messages
2018-06-20 21:59 [patch] adjust default nvptx launch geometry for OpenACC offloaded regions Cesar Philippidis
2018-06-20 22:16 ` Tom de Vries
2018-06-21 13:58   ` Cesar Philippidis
2018-07-02 14:14     ` Tom de Vries
2018-07-02 14:39       ` Cesar Philippidis
2018-07-11 19:13       ` Cesar Philippidis
2018-07-26 11:58         ` Tom de Vries
2018-07-26 12:13         ` [libgomp, nvptx] Move device property sampling from nvptx_exec to nvptx_open Tom de Vries
2018-07-26 12:45         ` [patch] adjust default nvptx launch geometry for OpenACC offloaded regions Tom de Vries
2018-07-26 14:27         ` Cesar Philippidis
2018-07-26 15:18           ` Tom de Vries
2018-07-30 10:16         ` Tom de Vries
2018-06-29 17:16 ` Cesar Philippidis
2018-06-30 11:36   ` Cesar Philippidis
