public inbox for gcc-patches@gcc.gnu.org
* [RFC PATCH] Coalesce host to device transfers in libgomp
@ 2017-10-24  9:57 Jakub Jelinek
  2017-10-24 15:59 ` Cesar Philippidis
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Jakub Jelinek @ 2017-10-24  9:57 UTC (permalink / raw)
  To: Alexander Monakov, Thomas Schwinge, Martin Jambor, Cesar Philippidis
  Cc: gcc-patches

Hi!

People from NVidia privately reported an unexpected number of host2dev
transfers for #pragma omp target*.
The code even had comments like:
                   /* FIXME: Perhaps add some smarts, like if copying
                      several adjacent fields from host to target, use some
                      host buffer to avoid sending each var individually.  */
and the especially bad example of this was the
      for (i = 0; i < mapnum; i++)
        {
          cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
          /* FIXME: see above FIXME comment.  */
          gomp_copy_host2dev (devicep,
                              (void *) (tgt->tgt_start + i * sizeof (void *)),
                              (void *) &cur_node.tgt_offset, sizeof (void *));
        }
loop transferring the addresses or firstprivate_int values to the device
- where we issued mapnum host2dev transfers, each just pointer-sized,
when we could have just prepared all the pointers in an array and host2dev
copied them all together.
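
For illustration only, a minimal sketch of that idea using the helpers
visible above (gomp_alloca, gomp_map_val and the current 4-argument
gomp_copy_host2dev); the actual patch below instead routes such copies
through a coalescing buffer:

  size_t i;
  void *whole = gomp_alloca (mapnum * sizeof (void *));
  for (i = 0; i < mapnum; i++)
    ((uintptr_t *) whole)[i] = gomp_map_val (tgt, hostaddrs, i);
  /* One host2dev transfer instead of mapnum pointer-sized ones.  */
  gomp_copy_host2dev (devicep, (void *) tgt->tgt_start, whole,
                      mapnum * sizeof (void *));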

The following patch implements coalescing of transfers (only those that are
copied into the freshly allocated device buffer) into one or multiple larger
transfers.  The patch doesn't coalesce > 32KB transfers or transfers where
the gap is 4KB or more.  I guess it would be not too hard to do similar
coalescing for the dev2host transfers that are from a single device mapping,
though probably far less important than the more common host2dev transfers.
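
To illustrate how those limits are meant to behave (offsets and lengths
here are hypothetical, relative to the newly allocated device buffer):

  gomp_cache_add (&cache, 0, 1024);       /* new chunk [0, 1024)  */
  gomp_cache_add (&cache, 3072, 1024);    /* 2KB gap < 4KB, chunk grows to [0, 4096)  */
  gomp_cache_add (&cache, 16384, 512);    /* gap >= 4KB, new chunk [16384, 16896)  */
  gomp_cache_add (&cache, 20480, 40960);  /* > 32KB, bypasses the buffer entirely  */

(A trailing chunk that covers just a single region is dropped again before
allocating the buffer, since it would be one host2dev copy anyway.)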

Tested on x86_64-linux to nvptx-none offloading.

Thoughts on this?

2017-10-24  Jakub Jelinek  <jakub@redhat.com>

	* target.c (struct gomp_map_cache): New type.
	(gomp_cache_add): New function.
	(gomp_copy_host2dev): Add CACHE argument, if copying into
	the cached ranges, memcpy into buffer instead of copying
	into device.
	(gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
	Add CACHE argument, pass it through to other calls.
	(gomp_map_vars): Aggregate copies from host to device if small enough
	and with small enough gaps in between into memcpy into a buffer and
	fewer host to device copies from the buffer.
	(gomp_update): Adjust gomp_copy_host2dev caller.

--- libgomp/target.c.jj	2017-04-20 14:59:08.296263304 +0200
+++ libgomp/target.c	2017-10-23 19:08:14.348336118 +0200
@@ -177,10 +177,77 @@ gomp_device_copy (struct gomp_device_des
     }
 }
 
+struct gomp_map_cache
+{
+  void *buf;
+  struct target_mem_desc *tgt;
+  size_t *chunks;
+  long chunk_cnt;
+  long use_cnt;
+};
+
+static inline void
+gomp_cache_add (struct gomp_map_cache *cache, size_t start, size_t len)
+{
+  if (len > 32 * 1024 || len == 0)
+    return;
+  if (cache->chunk_cnt)
+    {
+      if (cache->chunk_cnt < 0)
+	return;
+      if (start < cache->chunks[2 * cache->chunk_cnt - 1])
+	{
+	  cache->chunk_cnt = -1;
+	  return;
+	}
+      if (start < cache->chunks[2 * cache->chunk_cnt - 1] + 4 * 1024)
+	{
+	  cache->chunks[2 * cache->chunk_cnt - 1] = start + len;
+	  cache->use_cnt++;
+	  return;
+	}
+      /* If the last chunk is only used by one mapping, discard it,
+	 as it will be one host to device copy anyway and
+	 memcpying it around will only waste cycles.  */
+      if (cache->use_cnt == 1)
+	cache->chunk_cnt--;
+    }
+  cache->chunks[2 * cache->chunk_cnt] = start;
+  cache->chunks[2 * cache->chunk_cnt + 1] = start + len;
+  cache->chunk_cnt++;
+  cache->use_cnt = 1;
+}
+
 static void
 gomp_copy_host2dev (struct gomp_device_descr *devicep,
-		    void *d, const void *h, size_t sz)
+		    void *d, const void *h, size_t sz,
+		    struct gomp_map_cache *cache)
 {
+  if (cache)
+    {
+      uintptr_t doff = (uintptr_t) d - cache->tgt->tgt_start;
+      if (doff < cache->chunks[2 * cache->chunk_cnt - 1])
+	{
+	  long first = 0;
+	  long last = cache->chunk_cnt - 1;
+	  while (first <= last)
+	    {
+	      long middle = (first + last) >> 1;
+	      if (cache->chunks[2 * middle + 1] <= doff)
+		first = middle + 1;
+	      else if (cache->chunks[2 * middle] <= doff)
+		{
+		  if (doff + sz > cache->chunks[2 * middle + 1])
+		    gomp_fatal ("internal libgomp cache error");
+		  memcpy ((char *) cache->buf + (doff - cache->chunks[0]),
+			  h, sz);
+		  return;
+		}
+	      else
+		last = middle - 1;
+	    }
+	}
+    }
   gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
 }
 
@@ -208,7 +275,7 @@ gomp_free_device_memory (struct gomp_dev
 static inline void
 gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
 			splay_tree_key newn, struct target_var_desc *tgt_var,
-			unsigned char kind)
+			unsigned char kind, struct gomp_map_cache *cache)
 {
   tgt_var->key = oldn;
   tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
@@ -232,7 +299,7 @@ gomp_map_vars_existing (struct gomp_devi
 			(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
 				  + newn->host_start - oldn->host_start),
 			(void *) newn->host_start,
-			newn->host_end - newn->host_start);
+			newn->host_end - newn->host_start, cache);
 
   if (oldn->refcount != REFCOUNT_INFINITY)
     oldn->refcount++;
@@ -247,7 +314,8 @@ get_kind (bool short_mapkind, void *kind
 
 static void
 gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
-		  uintptr_t target_offset, uintptr_t bias)
+		  uintptr_t target_offset, uintptr_t bias,
+		  struct gomp_map_cache *cache)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
   struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -257,11 +325,10 @@ gomp_map_pointer (struct target_mem_desc
   if (cur_node.host_start == (uintptr_t) NULL)
     {
       cur_node.tgt_offset = (uintptr_t) NULL;
-      /* FIXME: see comment about coalescing host/dev transfers below.  */
       gomp_copy_host2dev (devicep,
 			  (void *) (tgt->tgt_start + target_offset),
 			  (void *) &cur_node.tgt_offset,
-			  sizeof (void *));
+			  sizeof (void *), cache);
       return;
     }
   /* Add bias to the pointer value.  */
@@ -280,15 +347,15 @@ gomp_map_pointer (struct target_mem_desc
      array section.  Now subtract bias to get what we want
      to initialize the pointer with.  */
   cur_node.tgt_offset -= bias;
-  /* FIXME: see comment about coalescing host/dev transfers below.  */
   gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
-		      (void *) &cur_node.tgt_offset, sizeof (void *));
+		      (void *) &cur_node.tgt_offset, sizeof (void *), cache);
 }
 
 static void
 gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
 			  size_t first, size_t i, void **hostaddrs,
-			  size_t *sizes, void *kinds)
+			  size_t *sizes, void *kinds,
+			  struct gomp_map_cache *cache)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
   struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -306,7 +373,7 @@ gomp_map_fields_existing (struct target_
       && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
     {
       gomp_map_vars_existing (devicep, n2, &cur_node,
-			      &tgt->list[i], kind & typemask);
+			      &tgt->list[i], kind & typemask, cache);
       return;
     }
   if (sizes[i] == 0)
@@ -322,7 +389,7 @@ gomp_map_fields_existing (struct target_
 		 == n2->tgt_offset - n->tgt_offset)
 	    {
 	      gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
-				      kind & typemask);
+				      kind & typemask, cache);
 	      return;
 	    }
 	}
@@ -334,7 +401,7 @@ gomp_map_fields_existing (struct target_
 	  && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
 	{
 	  gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
-				  kind & typemask);
+				  kind & typemask, cache);
 	  return;
 	}
     }
@@ -381,6 +448,7 @@ gomp_map_vars (struct gomp_device_descr
   tgt->list_count = mapnum;
   tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
   tgt->device_descr = devicep;
+  struct gomp_map_cache cache, *cachep = NULL;
 
   if (mapnum == 0)
     {
@@ -391,11 +459,25 @@ gomp_map_vars (struct gomp_device_descr
 
   tgt_align = sizeof (void *);
   tgt_size = 0;
+  cache.chunks = NULL;
+  cache.chunk_cnt = -1;
+  cache.use_cnt = 0;
+  cache.buf = NULL;
+  if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
+    {
+      cache.chunks
+	= (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
+      cache.chunk_cnt = 0;
+    }
   if (pragma_kind == GOMP_MAP_VARS_TARGET)
     {
       size_t align = 4 * sizeof (void *);
       tgt_align = align;
       tgt_size = mapnum * sizeof (void *);
+      cache.chunk_cnt = 1;
+      cache.use_cnt = 1 + (mapnum > 1);
+      cache.chunks[0] = 0;
+      cache.chunks[1] = tgt_size;
     }
 
   gomp_mutex_lock (&devicep->lock);
@@ -449,19 +531,34 @@ gomp_map_vars (struct gomp_device_descr
 	      size_t align = (size_t) 1 << (kind >> rshift);
 	      if (tgt_align < align)
 		tgt_align = align;
-	      tgt_size -= (uintptr_t) hostaddrs[first]
-			  - (uintptr_t) hostaddrs[i];
+	      tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
 	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	      tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
+	      tgt_size += cur_node.host_end - cur_node.host_start;
 	      not_found_cnt += last - i;
 	      for (i = first; i <= last; i++)
-		tgt->list[i].key = NULL;
+		{
+		  tgt->list[i].key = NULL;
+		  switch (get_kind (short_mapkind, kinds, i) & typemask)
+		    {
+		    case GOMP_MAP_ALLOC:
+		    case GOMP_MAP_FROM:
+		    case GOMP_MAP_FORCE_ALLOC:
+		    case GOMP_MAP_ALWAYS_FROM:
+		      break;
+		    default:
+		      /* All the others copy data if newly allocated.  */
+		      gomp_cache_add (&cache, tgt_size - cur_node.host_end
+					      + (uintptr_t) hostaddrs[i],
+				      sizes[i]);
+		      break;
+		    }
+		}
 	      i--;
 	      continue;
 	    }
 	  for (i = first; i <= last; i++)
 	    gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
-				      sizes, kinds);
+				      sizes, kinds, NULL);
 	  i--;
 	  continue;
 	}
@@ -485,6 +582,8 @@ gomp_map_vars (struct gomp_device_descr
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  gomp_cache_add (&cache, tgt_size,
+			  cur_node.host_end - cur_node.host_start);
 	  tgt_size += cur_node.host_end - cur_node.host_start;
 	  has_firstprivate = true;
 	  continue;
@@ -504,7 +603,7 @@ gomp_map_vars (struct gomp_device_descr
 	n = splay_tree_lookup (mem_map, &cur_node);
       if (n && n->refcount != REFCOUNT_LINK)
 	gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
-				kind & typemask);
+				kind & typemask, NULL);
       else
 	{
 	  tgt->list[i].key = NULL;
@@ -514,6 +613,19 @@ gomp_map_vars (struct gomp_device_descr
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  switch (kind & typemask)
+	    {
+	    case GOMP_MAP_ALLOC:
+	    case GOMP_MAP_FROM:
+	    case GOMP_MAP_FORCE_ALLOC:
+	    case GOMP_MAP_ALWAYS_FROM:
+	      break;
+	    default:
+	      /* All the others copy data if newly allocated.  */
+	      gomp_cache_add (&cache, tgt_size,
+			      cur_node.host_end - cur_node.host_start);
+	      break;
+	    }
 	  tgt_size += cur_node.host_end - cur_node.host_start;
 	  if ((kind & typemask) == GOMP_MAP_TO_PSET)
 	    {
@@ -562,6 +674,19 @@ gomp_map_vars (struct gomp_device_descr
       tgt->tgt_start = (uintptr_t) tgt->to_free;
       tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1);
       tgt->tgt_end = tgt->tgt_start + tgt_size;
+
+      if (cache.use_cnt == 1)
+	cache.chunk_cnt--;
+      if (cache.chunk_cnt > 0)
+	{
+	  cache.buf
+	    = malloc (cache.chunks[2 * cache.chunk_cnt - 1] - cache.chunks[0]);
+	  if (cache.buf)
+	    {
+	      cache.tgt = tgt;
+	      cachep = &cache;
+	    }
+	}
     }
   else
     {
@@ -600,7 +725,7 @@ gomp_map_vars (struct gomp_device_descr
 		len = sizes[i];
 		gomp_copy_host2dev (devicep,
 				    (void *) (tgt->tgt_start + tgt_size),
-				    (void *) hostaddrs[i], len);
+				    (void *) hostaddrs[i], len, cachep);
 		tgt_size += len;
 		continue;
 	      case GOMP_MAP_FIRSTPRIVATE_INT:
@@ -633,7 +758,7 @@ gomp_map_vars (struct gomp_device_descr
 		  }
 		for (i = first; i <= last; i++)
 		  gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
-					    sizes, kinds);
+					    sizes, kinds, cachep);
 		i--;
 		continue;
 	      case GOMP_MAP_ALWAYS_POINTER:
@@ -658,7 +783,7 @@ gomp_map_vars (struct gomp_device_descr
 					      + cur_node.host_start
 					      - n->host_start),
 				    (void *) &cur_node.tgt_offset,
-				    sizeof (void *));
+				    sizeof (void *), cachep);
 		cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
 				      + cur_node.host_start - n->host_start;
 		continue;
@@ -674,7 +799,7 @@ gomp_map_vars (struct gomp_device_descr
 	    splay_tree_key n = splay_tree_lookup (mem_map, k);
 	    if (n && n->refcount != REFCOUNT_LINK)
 	      gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
-				      kind & typemask);
+				      kind & typemask, cachep);
 	    else
 	      {
 		k->link_key = NULL;
@@ -725,26 +850,22 @@ gomp_map_vars (struct gomp_device_descr
 		  case GOMP_MAP_FORCE_TOFROM:
 		  case GOMP_MAP_ALWAYS_TO:
 		  case GOMP_MAP_ALWAYS_TOFROM:
-		    /* FIXME: Perhaps add some smarts, like if copying
-		       several adjacent fields from host to target, use some
-		       host buffer to avoid sending each var individually.  */
 		    gomp_copy_host2dev (devicep,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start);
+					k->host_end - k->host_start, cachep);
 		    break;
 		  case GOMP_MAP_POINTER:
 		    gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
-				      k->tgt_offset, sizes[i]);
+				      k->tgt_offset, sizes[i], cachep);
 		    break;
 		  case GOMP_MAP_TO_PSET:
-		    /* FIXME: see above FIXME comment.  */
 		    gomp_copy_host2dev (devicep,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start);
+					k->host_end - k->host_start, cachep);
 
 		    for (j = i + 1; j < mapnum; j++)
 		      if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
@@ -767,7 +888,7 @@ gomp_map_vars (struct gomp_device_descr
 					    k->tgt_offset
 					    + ((uintptr_t) hostaddrs[j]
 					       - k->host_start),
-					    sizes[j]);
+					    sizes[j], cachep);
 			  i++;
 			}
 		    break;
@@ -795,7 +916,7 @@ gomp_map_vars (struct gomp_device_descr
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					sizeof (void *));
+					sizeof (void *), cachep);
 		    break;
 		  default:
 		    gomp_mutex_unlock (&devicep->lock);
@@ -822,13 +943,23 @@ gomp_map_vars (struct gomp_device_descr
       for (i = 0; i < mapnum; i++)
 	{
 	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
-	  /* FIXME: see above FIXME comment.  */
 	  gomp_copy_host2dev (devicep,
 			      (void *) (tgt->tgt_start + i * sizeof (void *)),
-			      (void *) &cur_node.tgt_offset, sizeof (void *));
+			      (void *) &cur_node.tgt_offset, sizeof (void *),
+			      cachep);
 	}
     }
 
+  if (cachep)
+    {
+      long c = 0;
+      for (c = 0; c < cache.chunk_cnt; ++c)
+	gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cache.chunks[2 * c]),
+			    (char *) cache.buf + (cache.chunks[2 * c] - cache.chunks[0]),
+			    cache.chunks[2 * c + 1] - cache.chunks[2 * c], NULL);
+      free (cache.buf);
+    }
+
   /* If the variable from "omp target enter data" map-list was already mapped,
      tgt is not needed.  Otherwise tgt will be freed by gomp_unmap_vars or
      gomp_exit_data.  */
@@ -970,7 +1101,7 @@ gomp_update (struct gomp_device_descr *d
 	    size_t size = cur_node.host_end - cur_node.host_start;
 
 	    if (GOMP_MAP_COPY_TO_P (kind & typemask))
-	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size);
+	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
 	    if (GOMP_MAP_COPY_FROM_P (kind & typemask))
 	      gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
 	  }

	Jakub


* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2017-10-24  9:57 [RFC PATCH] Coalesce host to device transfers in libgomp Jakub Jelinek
@ 2017-10-24 15:59 ` Cesar Philippidis
  2017-10-24 16:02   ` Jakub Jelinek
  2017-10-24 17:40 ` Alexander Monakov
  2019-05-23 14:40 ` Thomas Schwinge
  2 siblings, 1 reply; 19+ messages in thread
From: Cesar Philippidis @ 2017-10-24 15:59 UTC (permalink / raw)
  To: Jakub Jelinek, Alexander Monakov, Thomas Schwinge, Martin Jambor
  Cc: gcc-patches

On 10/24/2017 02:55 AM, Jakub Jelinek wrote:

> People from NVidia privately reported an unexpected number of host2dev
> transfers for #pragma omp target*.

Did they mention which program they were testing?

> The code even had comments like:
>                    /* FIXME: Perhaps add some smarts, like if copying
>                       several adjacent fields from host to target, use some
>                       host buffer to avoid sending each var individually.  */
> and the especially bad example of this was the
>       for (i = 0; i < mapnum; i++)
>         {
>           cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
>           /* FIXME: see above FIXME comment.  */
>           gomp_copy_host2dev (devicep,
>                               (void *) (tgt->tgt_start + i * sizeof (void *)),
>                               (void *) &cur_node.tgt_offset, sizeof (void *));
>         }
> loop transferring the addresses or firstprivate_int values to the device
> - where we issued mapnum host2dev transfers, each just pointer-sized,
> when we could have just prepared all the pointers in an array and host2dev
> copied them all together.
> 
> The following patch implements coalescing of transfers (only those that are
> copied into the freshly allocated device buffer) into one or multiple larger
> transfers.  The patch doesn't coalesce > 32KB transfers or transfers where
> the gap is 4KB or more.  I guess it would be not too hard to do similar
> coalescing for the dev2host transfers that are from a single device mapping,
> though probably far less important than the more common host2dev transfers.

Why did you choose the 32KB and 4KB limits? I wonder if that would have
any impact on firstprivate_int values. If this proves to be effective,
it seems like we should be able to eliminate GOMP_MAP_FIRSTPRIVATE_INT
altogether.

> Tested on x86_64-linux to nvptx-none offloading.

By the way, you can capture all of the calls to the CUDA driver using
nvprof. I usually use this command:

  nvprof --csv --print-gpu-trace --log-file foo.nvprof ./foo.exe

You can then grep for CUDA, which usually represents the
dev2host/host2dev functions.

> Thoughts on this?
> 
> 2017-10-24  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* target.c (struct gomp_map_cache): New type.
> 	(gomp_cache_add): New function.
> 	(gomp_copy_host2dev): Add CACHE argument, if copying into
> 	the cached ranges, memcpy into buffer instead of copying
> 	into device.
> 	(gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
> 	Add CACHE argument, pass it through to other calls.
> 	(gomp_map_vars): Aggregate copies from host to device if small enough
> 	and with small enough gaps in between into memcpy into a buffer and
> 	fewer host to device copies from the buffer.
> 	(gomp_update): Adjust gomp_copy_host2dev caller.
> 
> --- libgomp/target.c.jj	2017-04-20 14:59:08.296263304 +0200
> +++ libgomp/target.c	2017-10-23 19:08:14.348336118 +0200
> @@ -177,10 +177,77 @@ gomp_device_copy (struct gomp_device_des
>      }
>  }
>  
> +struct gomp_map_cache
> +{
> +  void *buf;
> +  struct target_mem_desc *tgt;
> +  size_t *chunks;
> +  long chunk_cnt;
> +  long use_cnt;
> +};
> +

Maybe include a comment here stating that you want to restrict caching
to variables of at most 32KB, with no gaps larger than 4KB between them?

> +static inline void
> +gomp_cache_add (struct gomp_map_cache *cache, size_t start, size_t len)
> +{
> +  if (len > 32 * 1024 || len == 0)
> +    return;
> +  if (cache->chunk_cnt)
> +    {
> +      if (cache->chunk_cnt < 0)
> +	return;
> +      if (start < cache->chunks[2 * cache->chunk_cnt - 1])
> +	{
> +	  cache->chunk_cnt = -1;
> +	  return;
> +	}
> +      if (start < cache->chunks[2 * cache->chunk_cnt - 1] + 4 * 1024)
> +	{
> +	  cache->chunks[2 * cache->chunk_cnt - 1] = start + len;
> +	  cache->use_cnt++;
> +	  return;
> +	}
> +      /* If the last chunk is only used by one mapping, discard it,
> +	 as it will be one host to device copy anyway and
> +	 memcpying it around will only waste cycles.  */
> +      if (cache->use_cnt == 1)
> +	cache->chunk_cnt--;
> +    }
> +  cache->chunks[2 * cache->chunk_cnt] = start;
> +  cache->chunks[2 * cache->chunk_cnt + 1] = start + len;
> +  cache->chunk_cnt++;
> +  cache->use_cnt = 1;
> +}

I'll need to swap in more state to review the rest of the patch, but I
like the idea.

One other minor optimization would be to change the arguments to offloaded
functions from a single struct to individual arguments. At least for
nvptx, cuLaunchKernel accepts variable arguments for PTX kernels. There
are two advantages of this. 1) At least with nvptx, nvptx_exec wouldn't
need to reserve a block of device memory for the struct argument. 2) This
would eliminate one level of indirection for each offloaded argument
(although SRA probably takes care of the majority of this already).
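
For reference, a rough sketch of what that could look like through the CUDA
driver API; the function, argument names and launch dimensions below are
placeholders, not what the nvptx plugin currently does:

  #include <cuda.h>

  /* Hypothetical: launch a PTX kernel taking three individual arguments
     instead of a single pointer to a device-resident argument struct.  */
  static CUresult
  launch_with_args (CUfunction kernel_fn, CUstream stream,
                    CUdeviceptr a, int n, double scale)
  {
    void *kargs[] = { &a, &n, &scale };  /* one kernelParams slot per argument */
    return cuLaunchKernel (kernel_fn, 1, 1, 1,   /* grid dims */
                           32, 1, 1,             /* block dims */
                           0 /* sharedMemBytes */, stream, kargs, NULL);
  }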

Cesar


* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2017-10-24 15:59 ` Cesar Philippidis
@ 2017-10-24 16:02   ` Jakub Jelinek
  2017-10-24 17:56     ` Alexander Monakov
  0 siblings, 1 reply; 19+ messages in thread
From: Jakub Jelinek @ 2017-10-24 16:02 UTC (permalink / raw)
  To: Cesar Philippidis
  Cc: Alexander Monakov, Thomas Schwinge, Martin Jambor, gcc-patches

On Tue, Oct 24, 2017 at 08:47:39AM -0700, Cesar Philippidis wrote:
> On 10/24/2017 02:55 AM, Jakub Jelinek wrote:
> 
> > People from NVidia privately reported an unexpected number of host2dev
> > transfers for #pragma omp target*.
> 
> Did they mention which program they were testing?

No.  Just the nvprof-counted transfer numbers from GCC and LLVM.

> > The following patch implements coalescing of transfers (only those that are
> > copied into the freshly allocated device buffer) into one or multiple larger
> > transfers.  The patch doesn't coalesce > 32KB transfers or transfers where
> > the gap is 4KB or more.  I guess it would be not too hard to do similar
> > coalescing for the dev2host transfers that are from a single device mapping,
> > though probably far less important than the more common host2dev transfers.
> 
> > Why did you choose the 32KB and 4KB limits? I wonder if that would have
> any impact on firstprivate_int values. If this proves to be effective,
> it seems like we should be able to eliminate GOMP_MAP_FIRSTPRIVATE_INT
> altogether.

The thing is that this is generic code, so it is hard to come up with
reasonable limits.  We could even have such limits e.g. in *devicep
if different offloading targets turn out to have different needs.
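
To make that concrete, a purely hypothetical sketch of such per-device knobs
(these fields do not exist; in the patch the limits are plain constants):

  /* Hypothetical additions to struct gomp_device_descr.  */
  size_t max_coalesce_size;   /* e.g. 32 * 1024 */
  size_t max_coalesce_gap;    /* e.g. 4 * 1024 */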

The 32KB and 4KB just come from some discussions with Alexander on IRC:
larger copies saturate the PCIe bus and the overhead isn't significant, so
in that case copying e.g. a megabyte into another buffer and then to the
device would likely not be beneficial.

I'd prefer to keep GOMP_MAP_FIRSTPRIVATE_INT; I think it is a useful
optimization for the most common case.  Even if with coalescing it is no
longer 2 separate host2dev copies compared to 1 for GOMP_MAP_FIRSTPRIVATE_INT,
dropping it would still mean extra memory dereferences both on the host and
on the target.

> > +struct gomp_map_cache
> > +{
> > +  void *buf;
> > +  struct target_mem_desc *tgt;
> > +  size_t *chunks;
> > +  long chunk_cnt;
> > +  long use_cnt;
> > +};
> > +
> 
> Maybe include a comment here stating that you want to restrict caching
> to variables of at most 32KB, with no gaps larger than 4KB between them?

Sure.  Maybe even better to turn those for now into defines and add comments
to those.

> One other minor optimization would be to change the arguments to offloaded
> functions from a single struct to individual arguments. At least for
> nvptx, cuLaunchKernel accepts variable arguments for PTX kernels. There
> are two advantages of this. 1) At least with nvptx, nvptx_exec wouldn't
> need to reserve a block of device memory for the struct argument. 2) This
> would eliminate one level of indirection for each offloaded argument
> (although SRA probably takes care of the majority of this already).

At least for OpenMP, we are now using a wrapper around the generated code
which sets things up, so I'm not sure if that would be possible.  The wrapper
among other things sets up the soft-stack.  It might be possible to replace
it with a magic call at the beginning of each OpenMP kernel.

	Jakub


* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2017-10-24  9:57 [RFC PATCH] Coalesce host to device transfers in libgomp Jakub Jelinek
  2017-10-24 15:59 ` Cesar Philippidis
@ 2017-10-24 17:40 ` Alexander Monakov
  2017-10-24 19:36   ` Jakub Jelinek
  2017-10-25 12:03   ` Jakub Jelinek
  2019-05-23 14:40 ` Thomas Schwinge
  2 siblings, 2 replies; 19+ messages in thread
From: Alexander Monakov @ 2017-10-24 17:40 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Thomas Schwinge, Martin Jambor, Cesar Philippidis, gcc-patches

On Tue, 24 Oct 2017, Jakub Jelinek wrote:
> loop transferring the addresses or firstprivate_int values to the device
> - where we issued mapnum host2dev transfers, each just pointer-sized,
> when we could have just prepared all the pointers in an array and host2dev
> copied them all together.

Can you please give an example OpenMP code? I thought such variables are
just fields of one omp_data_? struct that is copied all at once, but I guess
I'm misunderstanding.

> Thoughts on this?

I need some time to understand the patch well; at the moment I have just
a couple of superficial comments, below.

> --- libgomp/target.c.jj	2017-04-20 14:59:08.296263304 +0200
> +++ libgomp/target.c	2017-10-23 19:08:14.348336118 +0200
> @@ -177,10 +177,77 @@ gomp_device_copy (struct gomp_device_des
>      }
>  }
>  
> +struct gomp_map_cache
> +{
> +  void *buf;
> +  struct target_mem_desc *tgt;
> +  size_t *chunks;
> +  long chunk_cnt;
> +  long use_cnt;
> +};

Would really appreciate comments for meaning of fields here.  Also, is the
struct properly named?  From the patch description I understood it to be a
copy coalescing buffer, not a cache.

> @@ -449,19 +531,34 @@ gomp_map_vars (struct gomp_device_descr
>  	      size_t align = (size_t) 1 << (kind >> rshift);
>  	      if (tgt_align < align)
>  		tgt_align = align;
> -	      tgt_size -= (uintptr_t) hostaddrs[first]
> -			  - (uintptr_t) hostaddrs[i];
> +	      tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
>  	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
> -	      tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
> +	      tgt_size += cur_node.host_end - cur_node.host_start;
>  	      not_found_cnt += last - i;
>  	      for (i = first; i <= last; i++)
> -		tgt->list[i].key = NULL;
> +		{
> +		  tgt->list[i].key = NULL;
> +		  switch (get_kind (short_mapkind, kinds, i) & typemask)
> +		    {
> +		    case GOMP_MAP_ALLOC:
> +		    case GOMP_MAP_FROM:
> +		    case GOMP_MAP_FORCE_ALLOC:
> +		    case GOMP_MAP_ALWAYS_FROM:
> +		      break;
> +		    default:
> +		      /* All the others copy data if newly allocated.  */
> +		      gomp_cache_add (&cache, tgt_size - cur_node.host_end
> +					      + (uintptr_t) hostaddrs[i],
> +				      sizes[i]);

A similar switch needed to be duplicated below.  Would it be appropriate to
pass the map kind to gomp_cache_add, or have a thin wrapper around it to have
checks for appropriate kinds in one place?

Thanks.
Alexander


* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2017-10-24 16:02   ` Jakub Jelinek
@ 2017-10-24 17:56     ` Alexander Monakov
  0 siblings, 0 replies; 19+ messages in thread
From: Alexander Monakov @ 2017-10-24 17:56 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Cesar Philippidis, Thomas Schwinge, Martin Jambor, gcc-patches

On Tue, 24 Oct 2017, Jakub Jelinek wrote:
> > Why did you choose the 32KB and 4KB limits? I wonder if that would have
> > any impact on firstprivate_int values. If this proves to be effective,
> > it seems like we should be able to eliminate GOMP_MAP_FIRSTPRIVATE_INT
> > altogether.
> 
> The thing is that this is a generic code, so it is hard to come up with
> reasonable limits.  We could even have some limits e.g. in *devicep
> if we get different needs for different offloading targets.
> 
> The 32KB and 4KB just come from some discussions with Alexander on IRC
> that larger copies saturate the PCI and the overhead isn't significant, so
> in that case copying e.g. megabyte into another memory and then to the
> device would likely not be beneficial.

Hm, I guess some miscommunication happened here.  On IRC I said,

>> from my tests, at 32+MB it approaches bus bandwidth (10GB/s for gen3 pcie),
>> at few kilobytes I'd expect aggregation to pay off

Note I really meant 32+ megabytes, not kilobytes, but of course I'm not
suggesting that libgomp allocates a multi-megabyte staging buffer and
memcpy's everything into it all the time.

Generally speaking, for optimal transfers one should use permanently allocated
locked ("pinned") memory and/or asynchronous transfers, but unfortunately at the
moment I don't have a good understanding of existing design and OpenMP spec
constraints to know what libgomp can/should do.
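
For what it's worth, a rough sketch of the pinned + asynchronous combination
via the CUDA driver API (error handling omitted, LEN assumed to fit the
staging buffer; nothing like this exists in libgomp today):

  #include <cuda.h>
  #include <string.h>

  /* Copy LEN bytes to the device through a page-locked staging buffer,
     queued asynchronously on STREAM.  */
  static void
  staged_host2dev (CUdeviceptr dev_dst, const void *host_src, size_t len,
                   CUstream stream)
  {
    static void *staging;               /* allocated once, reused */
    if (!staging)
      cuMemHostAlloc (&staging, 32 * 1024, 0);
    memcpy (staging, host_src, len);
    cuMemcpyHtoDAsync (dev_dst, staging, len, stream);
    /* A real implementation would queue several copies (each with its own
       staging slot) and synchronize only once, before releasing the device
       lock or reusing the buffers.  */
    cuStreamSynchronize (stream);
  }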

Alexander


* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2017-10-24 17:40 ` Alexander Monakov
@ 2017-10-24 19:36   ` Jakub Jelinek
  2017-10-25 12:03   ` Jakub Jelinek
  1 sibling, 0 replies; 19+ messages in thread
From: Jakub Jelinek @ 2017-10-24 19:36 UTC (permalink / raw)
  To: Alexander Monakov
  Cc: Thomas Schwinge, Martin Jambor, Cesar Philippidis, gcc-patches

On Tue, Oct 24, 2017 at 08:39:13PM +0300, Alexander Monakov wrote:
> On Tue, 24 Oct 2017, Jakub Jelinek wrote:
> > loop transferring the addresses or firstprivate_int values to the device
> > - where we issued mapnum host2dev transfers, each just pointer-sized,
> > when we could have just prepared all the pointers in an array and host2dev
> > copied them all together.
> 
> Can you please give an example OpenMP code? I thought such variables are
> just fields of one omp_data_? struct that is copied all at once, but I guess
> I'm misunderstanding.

Basically anything with multiple mappings.
void foo () {
int a[10], b[10], c[10], d, e, f;
struct S { int g[10]; } h;
init (a, b, c, &d, &e, &f, &h);
#pragma omp target map(to:a, b, c) firstprivate (d, e, f, h)
use (a, b, c, d, e, f, h);
}
The above has mapnum 7; if none of this is mapped, then the current trunk
will perform 3 host2dev 40-byte copies for the 3 arrays, one 40-byte copy for
the firstprivate h, and 7 pointer-sized copies for the addresses of the 3
arrays, the one firstprivate struct and the 3 ints encoded in pointers.
As all four 40-byte allocations plus the 7 pointer-sized slots
are adjacent with no gaps, with the patch there will be a single host2dev
transfer of 160+7*sizeof(void*) bytes.
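
Spelled out (assuming 64-bit pointers; the exact offsets depend on the order
and alignment of the mappings, so they are only illustrative):

  [  0,  56)  7 pointer-sized slots (3 array addresses, the address of the
              device copy of h, and d/e/f encoded in pointers)
  [ 56,  96)  a
  [ 96, 136)  b
  [136, 176)  c
  [176, 216)  firstprivate copy of h
  => a single chunk [0, 216), i.e. 160 + 7 * sizeof (void *) bytes, sent
     with one host2dev transfer.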

> > +struct gomp_map_cache
> > +{
> > +  void *buf;
> > +  struct target_mem_desc *tgt;
> > +  size_t *chunks;
> > +  long chunk_cnt;
> > +  long use_cnt;
> > +};
> 
> Would really appreciate comments for meaning of fields here.  Also, is the
> struct properly named?  From the patch description I understood it to be a
> copy coalescing buffer, not a cache.

I'll rename it and add comments.

> > @@ -449,19 +531,34 @@ gomp_map_vars (struct gomp_device_descr
> >  	      size_t align = (size_t) 1 << (kind >> rshift);
> >  	      if (tgt_align < align)
> >  		tgt_align = align;
> > -	      tgt_size -= (uintptr_t) hostaddrs[first]
> > -			  - (uintptr_t) hostaddrs[i];
> > +	      tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
> >  	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
> > -	      tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
> > +	      tgt_size += cur_node.host_end - cur_node.host_start;
> >  	      not_found_cnt += last - i;
> >  	      for (i = first; i <= last; i++)
> > -		tgt->list[i].key = NULL;
> > +		{
> > +		  tgt->list[i].key = NULL;
> > +		  switch (get_kind (short_mapkind, kinds, i) & typemask)
> > +		    {
> > +		    case GOMP_MAP_ALLOC:
> > +		    case GOMP_MAP_FROM:
> > +		    case GOMP_MAP_FORCE_ALLOC:
> > +		    case GOMP_MAP_ALWAYS_FROM:
> > +		      break;
> > +		    default:
> > +		      /* All the others copy data if newly allocated.  */
> > +		      gomp_cache_add (&cache, tgt_size - cur_node.host_end
> > +					      + (uintptr_t) hostaddrs[i],
> > +				      sizes[i]);
> 
> A similar switch needed to be duplicated below.  Would it be appropriate to
> pass the map kind to gomp_cache_add, or have a thin wrapper around it to have
> checks for appropriate kinds in one place?

No, I'd prefer to keep the logic out of gomp_cache_add, but I can add an
inline predicate for whether a kind writes to the device.

	Jakub


* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2017-10-24 17:40 ` Alexander Monakov
  2017-10-24 19:36   ` Jakub Jelinek
@ 2017-10-25 12:03   ` Jakub Jelinek
  2017-10-27 14:13     ` [PATCH] Implement omp async support for nvptx Tom de Vries
                       ` (2 more replies)
  1 sibling, 3 replies; 19+ messages in thread
From: Jakub Jelinek @ 2017-10-25 12:03 UTC (permalink / raw)
  To: Alexander Monakov
  Cc: Thomas Schwinge, Martin Jambor, Cesar Philippidis, gcc-patches

On Tue, Oct 24, 2017 at 08:39:13PM +0300, Alexander Monakov wrote:
> > +struct gomp_map_cache
> > +{
> > +  void *buf;
> > +  struct target_mem_desc *tgt;
> > +  size_t *chunks;
> > +  long chunk_cnt;
> > +  long use_cnt;
> > +};
> 
> Would really appreciate comments for meaning of fields here.  Also, is the
> struct properly named?  From the patch description I understood it to be a
> copy coalescing buffer, not a cache.
...

Here is an updated patch with some renaming, extra macros, one extra inline
function and comments.

As for async transfers, I think at least right now we need to make sure the
transfers complete by the time we release the device lock, but we could
perhaps gain something by queueing the transfers asynchronously and then
waiting for them before releasing the lock (adding two further plugin
callbacks and some way to keep around a device "async" handle).
And we don't really have the async target implemented yet for NVPTX :(,
guess that should be the highest priority after this optimization.
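
Something like the following two hypothetical plugin entry points, just to
sketch the idea; the names and signatures are made up and nothing like this
exists in the plugin interface yet:

  /* Queue a host to device copy on the device's async handle, without
     waiting for it to complete.  */
  extern bool GOMP_OFFLOAD_host2dev_async (int ord, void *dst,
                                           const void *src, size_t n,
                                           void *async_handle);

  /* Wait for all copies queued on ASYNC_HANDLE; would be called before
     the device lock is released.  */
  extern bool GOMP_OFFLOAD_async_wait (int ord, void *async_handle);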

2017-10-25  Jakub Jelinek  <jakub@redhat.com>

	* target.c (struct gomp_coalesce_buf): New type.
	(MAX_COALESCE_BUF_SIZE, MAX_COALESCE_BUF_GAP): Define.
	(gomp_coalesce_buf_add, gomp_to_device_kind_p): New functions.
	(gomp_copy_host2dev): Add CBUF argument, if copying into
	the cached ranges, memcpy into buffer instead of copying
	into device.
	(gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
	Add CBUF argument, pass it through to other calls.
	(gomp_map_vars): Aggregate copies from host to device if small enough
	and with small enough gaps in between into memcpy into a buffer and
	fewer host to device copies from the buffer.
	(gomp_update): Adjust gomp_copy_host2dev caller.

--- libgomp/target.c.jj	2017-10-24 12:07:03.763759657 +0200
+++ libgomp/target.c	2017-10-25 13:17:31.608975390 +0200
@@ -177,10 +177,122 @@ gomp_device_copy (struct gomp_device_des
     }
 }
 
+/* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
+   host to device memory transfers.  */
+
+struct gomp_coalesce_buf
+{
+  /* Buffer into which gomp_copy_host2dev will memcpy data and from which
+     it will be copied to the device.  */
+  void *buf;
+  struct target_mem_desc *tgt;
+  /* Array with offsets, chunks[2 * i] is the starting offset and
+     chunks[2 * i + 1] ending offset relative to tgt->tgt_start device address
+     of chunks which are to be copied to buf and later copied to device.  */
+  size_t *chunks;
+  /* Number of chunks in chunks array, or -1 if coalesce buffering should not
+     be performed.  */
+  long chunk_cnt;
+  /* During construction of chunks array, how many memory regions are within
+     the last chunk.  If there is just one memory region for a chunk, we copy
+     it directly to device rather than going through buf.  */
+  long use_cnt;
+};
+
+/* Maximum size of memory region considered for coalescing.  Larger copies
+   are performed directly.  */
+#define MAX_COALESCE_BUF_SIZE	(32 * 1024)
+
+/* Maximum size of a gap in between regions to consider them being copied
+   within the same chunk.  All the device offsets considered are within
+   newly allocated device memory, so it isn't fatal if we copy some padding
+   in between from host to device.  The gaps come either from alignment
+   padding or from memory regions which are not supposed to be copied from
+   host to device (e.g. map(alloc:), map(from:) etc.).  */
+#define MAX_COALESCE_BUF_GAP	(4 * 1024)
+
+/* Add region with device tgt_start relative offset and length to CBUF.  */
+
+static inline void
+gomp_coalesce_buf_add (struct gomp_coalesce_buf *cbuf, size_t start, size_t len)
+{
+  if (len > MAX_COALESCE_BUF_SIZE || len == 0)
+    return;
+  if (cbuf->chunk_cnt)
+    {
+      if (cbuf->chunk_cnt < 0)
+	return;
+      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
+	{
+	  cbuf->chunk_cnt = -1;
+	  return;
+	}
+      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + MAX_COALESCE_BUF_GAP)
+	{
+	  cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len;
+	  cbuf->use_cnt++;
+	  return;
+	}
+      /* If the last chunk is only used by one mapping, discard it,
+	 as it will be one host to device copy anyway and
+	 memcpying it around will only waste cycles.  */
+      if (cbuf->use_cnt == 1)
+	cbuf->chunk_cnt--;
+    }
+  cbuf->chunks[2 * cbuf->chunk_cnt] = start;
+  cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len;
+  cbuf->chunk_cnt++;
+  cbuf->use_cnt = 1;
+}
+
+/* Return true for mapping kinds which need to copy data from the
+   host to device for regions that weren't previously mapped.  */
+
+static inline bool
+gomp_to_device_kind_p (int kind)
+{
+  switch (kind)
+    {
+    case GOMP_MAP_ALLOC:
+    case GOMP_MAP_FROM:
+    case GOMP_MAP_FORCE_ALLOC:
+    case GOMP_MAP_ALWAYS_FROM:
+      return false;
+    default:
+      return true;
+    }
+}
+
 static void
 gomp_copy_host2dev (struct gomp_device_descr *devicep,
-		    void *d, const void *h, size_t sz)
+		    void *d, const void *h, size_t sz,
+		    struct gomp_coalesce_buf *cbuf)
 {
+  if (cbuf)
+    {
+      uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
+      if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
+	{
+	  long first = 0;
+	  long last = cbuf->chunk_cnt - 1;
+	  while (first <= last)
+	    {
+	      long middle = (first + last) >> 1;
+	      if (cbuf->chunks[2 * middle + 1] <= doff)
+		first = middle + 1;
+	      else if (cbuf->chunks[2 * middle] <= doff)
+		{
+		  if (doff + sz > cbuf->chunks[2 * middle + 1])
+		    gomp_fatal ("internal libgomp cbuf error");
+		  memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]),
+			  h, sz);
+		  return;
+		}
+	      else
+		last = middle - 1;
+	    }
+	}
+    }
   gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
 }
 
@@ -208,7 +320,7 @@ gomp_free_device_memory (struct gomp_dev
 static inline void
 gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
 			splay_tree_key newn, struct target_var_desc *tgt_var,
-			unsigned char kind)
+			unsigned char kind, struct gomp_coalesce_buf *cbuf)
 {
   tgt_var->key = oldn;
   tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
@@ -232,7 +344,7 @@ gomp_map_vars_existing (struct gomp_devi
 			(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
 				  + newn->host_start - oldn->host_start),
 			(void *) newn->host_start,
-			newn->host_end - newn->host_start);
+			newn->host_end - newn->host_start, cbuf);
 
   if (oldn->refcount != REFCOUNT_INFINITY)
     oldn->refcount++;
@@ -247,7 +359,8 @@ get_kind (bool short_mapkind, void *kind
 
 static void
 gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
-		  uintptr_t target_offset, uintptr_t bias)
+		  uintptr_t target_offset, uintptr_t bias,
+		  struct gomp_coalesce_buf *cbuf)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
   struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -257,11 +370,10 @@ gomp_map_pointer (struct target_mem_desc
   if (cur_node.host_start == (uintptr_t) NULL)
     {
       cur_node.tgt_offset = (uintptr_t) NULL;
-      /* FIXME: see comment about coalescing host/dev transfers below.  */
       gomp_copy_host2dev (devicep,
 			  (void *) (tgt->tgt_start + target_offset),
 			  (void *) &cur_node.tgt_offset,
-			  sizeof (void *));
+			  sizeof (void *), cbuf);
       return;
     }
   /* Add bias to the pointer value.  */
@@ -280,15 +392,15 @@ gomp_map_pointer (struct target_mem_desc
      array section.  Now subtract bias to get what we want
      to initialize the pointer with.  */
   cur_node.tgt_offset -= bias;
-  /* FIXME: see comment about coalescing host/dev transfers below.  */
   gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
-		      (void *) &cur_node.tgt_offset, sizeof (void *));
+		      (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
 }
 
 static void
 gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
 			  size_t first, size_t i, void **hostaddrs,
-			  size_t *sizes, void *kinds)
+			  size_t *sizes, void *kinds,
+			  struct gomp_coalesce_buf *cbuf)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
   struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -306,7 +418,7 @@ gomp_map_fields_existing (struct target_
       && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
     {
       gomp_map_vars_existing (devicep, n2, &cur_node,
-			      &tgt->list[i], kind & typemask);
+			      &tgt->list[i], kind & typemask, cbuf);
       return;
     }
   if (sizes[i] == 0)
@@ -322,7 +434,7 @@ gomp_map_fields_existing (struct target_
 		 == n2->tgt_offset - n->tgt_offset)
 	    {
 	      gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
-				      kind & typemask);
+				      kind & typemask, cbuf);
 	      return;
 	    }
 	}
@@ -334,7 +446,7 @@ gomp_map_fields_existing (struct target_
 	  && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
 	{
 	  gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
-				  kind & typemask);
+				  kind & typemask, cbuf);
 	  return;
 	}
     }
@@ -381,6 +493,7 @@ gomp_map_vars (struct gomp_device_descr
   tgt->list_count = mapnum;
   tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
   tgt->device_descr = devicep;
+  struct gomp_coalesce_buf cbuf, *cbufp = NULL;
 
   if (mapnum == 0)
     {
@@ -391,11 +504,25 @@ gomp_map_vars (struct gomp_device_descr
 
   tgt_align = sizeof (void *);
   tgt_size = 0;
+  cbuf.chunks = NULL;
+  cbuf.chunk_cnt = -1;
+  cbuf.use_cnt = 0;
+  cbuf.buf = NULL;
+  if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
+    {
+      cbuf.chunks
+	= (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
+      cbuf.chunk_cnt = 0;
+    }
   if (pragma_kind == GOMP_MAP_VARS_TARGET)
     {
       size_t align = 4 * sizeof (void *);
       tgt_align = align;
       tgt_size = mapnum * sizeof (void *);
+      cbuf.chunk_cnt = 1;
+      cbuf.use_cnt = 1 + (mapnum > 1);
+      cbuf.chunks[0] = 0;
+      cbuf.chunks[1] = tgt_size;
     }
 
   gomp_mutex_lock (&devicep->lock);
@@ -449,19 +576,26 @@ gomp_map_vars (struct gomp_device_descr
 	      size_t align = (size_t) 1 << (kind >> rshift);
 	      if (tgt_align < align)
 		tgt_align = align;
-	      tgt_size -= (uintptr_t) hostaddrs[first]
-			  - (uintptr_t) hostaddrs[i];
+	      tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
 	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	      tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
+	      tgt_size += cur_node.host_end - cur_node.host_start;
 	      not_found_cnt += last - i;
 	      for (i = first; i <= last; i++)
-		tgt->list[i].key = NULL;
+		{
+		  tgt->list[i].key = NULL;
+		  if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
+					     & typemask))
+		    gomp_coalesce_buf_add (&cbuf,
+					   tgt_size - cur_node.host_end
+					   + (uintptr_t) hostaddrs[i],
+					   sizes[i]);
+		}
 	      i--;
 	      continue;
 	    }
 	  for (i = first; i <= last; i++)
 	    gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
-				      sizes, kinds);
+				      sizes, kinds, NULL);
 	  i--;
 	  continue;
 	}
@@ -485,6 +619,8 @@ gomp_map_vars (struct gomp_device_descr
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  gomp_coalesce_buf_add (&cbuf, tgt_size,
+				 cur_node.host_end - cur_node.host_start);
 	  tgt_size += cur_node.host_end - cur_node.host_start;
 	  has_firstprivate = true;
 	  continue;
@@ -504,7 +640,7 @@ gomp_map_vars (struct gomp_device_descr
 	n = splay_tree_lookup (mem_map, &cur_node);
       if (n && n->refcount != REFCOUNT_LINK)
 	gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
-				kind & typemask);
+				kind & typemask, NULL);
       else
 	{
 	  tgt->list[i].key = NULL;
@@ -514,6 +650,9 @@ gomp_map_vars (struct gomp_device_descr
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  if (gomp_to_device_kind_p (kind & typemask))
+	    gomp_coalesce_buf_add (&cbuf, tgt_size,
+				   cur_node.host_end - cur_node.host_start);
 	  tgt_size += cur_node.host_end - cur_node.host_start;
 	  if ((kind & typemask) == GOMP_MAP_TO_PSET)
 	    {
@@ -562,6 +701,19 @@ gomp_map_vars (struct gomp_device_descr
       tgt->tgt_start = (uintptr_t) tgt->to_free;
       tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1);
       tgt->tgt_end = tgt->tgt_start + tgt_size;
+
+      if (cbuf.use_cnt == 1)
+	cbuf.chunk_cnt--;
+      if (cbuf.chunk_cnt > 0)
+	{
+	  cbuf.buf
+	    = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]);
+	  if (cbuf.buf)
+	    {
+	      cbuf.tgt = tgt;
+	      cbufp = &cbuf;
+	    }
+	}
     }
   else
     {
@@ -600,7 +752,7 @@ gomp_map_vars (struct gomp_device_descr
 		len = sizes[i];
 		gomp_copy_host2dev (devicep,
 				    (void *) (tgt->tgt_start + tgt_size),
-				    (void *) hostaddrs[i], len);
+				    (void *) hostaddrs[i], len, cbufp);
 		tgt_size += len;
 		continue;
 	      case GOMP_MAP_FIRSTPRIVATE_INT:
@@ -633,7 +785,7 @@ gomp_map_vars (struct gomp_device_descr
 		  }
 		for (i = first; i <= last; i++)
 		  gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
-					    sizes, kinds);
+					    sizes, kinds, cbufp);
 		i--;
 		continue;
 	      case GOMP_MAP_ALWAYS_POINTER:
@@ -658,7 +810,7 @@ gomp_map_vars (struct gomp_device_descr
 					      + cur_node.host_start
 					      - n->host_start),
 				    (void *) &cur_node.tgt_offset,
-				    sizeof (void *));
+				    sizeof (void *), cbufp);
 		cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
 				      + cur_node.host_start - n->host_start;
 		continue;
@@ -674,7 +826,7 @@ gomp_map_vars (struct gomp_device_descr
 	    splay_tree_key n = splay_tree_lookup (mem_map, k);
 	    if (n && n->refcount != REFCOUNT_LINK)
 	      gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
-				      kind & typemask);
+				      kind & typemask, cbufp);
 	    else
 	      {
 		k->link_key = NULL;
@@ -725,26 +877,22 @@ gomp_map_vars (struct gomp_device_descr
 		  case GOMP_MAP_FORCE_TOFROM:
 		  case GOMP_MAP_ALWAYS_TO:
 		  case GOMP_MAP_ALWAYS_TOFROM:
-		    /* FIXME: Perhaps add some smarts, like if copying
-		       several adjacent fields from host to target, use some
-		       host buffer to avoid sending each var individually.  */
 		    gomp_copy_host2dev (devicep,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start);
+					k->host_end - k->host_start, cbufp);
 		    break;
 		  case GOMP_MAP_POINTER:
 		    gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
-				      k->tgt_offset, sizes[i]);
+				      k->tgt_offset, sizes[i], cbufp);
 		    break;
 		  case GOMP_MAP_TO_PSET:
-		    /* FIXME: see above FIXME comment.  */
 		    gomp_copy_host2dev (devicep,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start);
+					k->host_end - k->host_start, cbufp);
 
 		    for (j = i + 1; j < mapnum; j++)
 		      if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
@@ -767,7 +915,7 @@ gomp_map_vars (struct gomp_device_descr
 					    k->tgt_offset
 					    + ((uintptr_t) hostaddrs[j]
 					       - k->host_start),
-					    sizes[j]);
+					    sizes[j], cbufp);
 			  i++;
 			}
 		    break;
@@ -795,7 +943,7 @@ gomp_map_vars (struct gomp_device_descr
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					sizeof (void *));
+					sizeof (void *), cbufp);
 		    break;
 		  default:
 		    gomp_mutex_unlock (&devicep->lock);
@@ -822,13 +970,23 @@ gomp_map_vars (struct gomp_device_descr
       for (i = 0; i < mapnum; i++)
 	{
 	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
-	  /* FIXME: see above FIXME comment.  */
 	  gomp_copy_host2dev (devicep,
 			      (void *) (tgt->tgt_start + i * sizeof (void *)),
-			      (void *) &cur_node.tgt_offset, sizeof (void *));
+			      (void *) &cur_node.tgt_offset, sizeof (void *),
+			      cbufp);
 	}
     }
 
+  if (cbufp)
+    {
+      long c = 0;
+      for (c = 0; c < cbuf.chunk_cnt; ++c)
+	gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cbuf.chunks[2 * c]),
+			    (char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]),
+			    cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL);
+      free (cbuf.buf);
+    }
+
   /* If the variable from "omp target enter data" map-list was already mapped,
      tgt is not needed.  Otherwise tgt will be freed by gomp_unmap_vars or
      gomp_exit_data.  */
@@ -970,7 +1128,7 @@ gomp_update (struct gomp_device_descr *d
 	    size_t size = cur_node.host_end - cur_node.host_start;
 
 	    if (GOMP_MAP_COPY_TO_P (kind & typemask))
-	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size);
+	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
 	    if (GOMP_MAP_COPY_FROM_P (kind & typemask))
 	      gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
 	  }


	Jakub


* [PATCH] Implement omp async support for nvptx
  2017-10-25 12:03   ` Jakub Jelinek
@ 2017-10-27 14:13     ` Tom de Vries
  2017-10-30  7:25       ` Jakub Jelinek
  2018-12-06 17:02     ` [RFC PATCH] Coalesce host to device transfers in libgomp Thomas Schwinge
  2019-12-18 17:15     ` Thomas Schwinge
  2 siblings, 1 reply; 19+ messages in thread
From: Tom de Vries @ 2017-10-27 14:13 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Alexander Monakov, Thomas Schwinge, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1030 bytes --]

[ was: Re: [RFC PATCH] Coalesce host to device transfers in libgomp ]
On 10/25/2017 01:38 PM, Jakub Jelinek wrote:
> And we don't really have the async target implemented yet for NVPTX:(,
> guess that should be the highest priority after this optimization.

Hi,

how about this approach:
1 - Move async_run from plugin-hsa.c to default_async_run
2 - Implement omp async support for nvptx
?

The first patch moves the GOMP_OFFLOAD_async_run implementation from 
plugin-hsa.c to target.c, making it the default implementation if the 
plugin does not define the GOMP_OFFLOAD_async_run symbol.

The second patch removes the GOMP_OFFLOAD_async_run symbol from the 
nvptx plugin, activating the default implementation, and makes sure 
GOMP_OFFLOAD_run can be called from a fresh thread.

I've tested this with libgomp.c/c.exp; the previously failing
target-33.c and target-34.c are now passing, and there are no regressions.

OK for trunk after complete testing (and adding a function comment for
default_async_run)?

Thanks,
- Tom


[-- Attachment #2: 0001-Move-async_run-from-plugin-hsa.c-to-default_async_run.patch --]
[-- Type: text/x-patch, Size: 5419 bytes --]

Move async_run from plugin-hsa.c to default_async_run

2017-10-27  Tom de Vries  <tom@codesourcery.com>

	* plugin/plugin-hsa.c (struct async_run_info): Move ...
	(run_kernel_asynchronously): Rename to ...
	(GOMP_OFFLOAD_async_run): Rename to ...
	* target.c (struct async_run_info): ... here.
	(default_async_run_1): ... this.
	(default_async_run): ... this.
	(gomp_target_task_fn): Handle missing async_run.
	(gomp_load_plugin_for_device): Make async_run optional.

---
 libgomp/plugin/plugin-hsa.c | 58 -----------------------------------------
 libgomp/target.c            | 63 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 60 insertions(+), 61 deletions(-)

diff --git a/libgomp/plugin/plugin-hsa.c b/libgomp/plugin/plugin-hsa.c
index fc08f5d..65a89a3 100644
--- a/libgomp/plugin/plugin-hsa.c
+++ b/libgomp/plugin/plugin-hsa.c
@@ -1625,64 +1625,6 @@ GOMP_OFFLOAD_run (int n __attribute__((unused)),
   run_kernel (kernel, vars, kla);
 }
 
-/* Information to be passed to a thread running a kernel asycnronously.  */
-
-struct async_run_info
-{
-  int device;
-  void *tgt_fn;
-  void *tgt_vars;
-  void **args;
-  void *async_data;
-};
-
-/* Thread routine to run a kernel asynchronously.  */
-
-static void *
-run_kernel_asynchronously (void *thread_arg)
-{
-  struct async_run_info *info = (struct async_run_info *) thread_arg;
-  int device = info->device;
-  void *tgt_fn = info->tgt_fn;
-  void *tgt_vars = info->tgt_vars;
-  void **args = info->args;
-  void *async_data = info->async_data;
-
-  free (info);
-  GOMP_OFFLOAD_run (device, tgt_fn, tgt_vars, args);
-  GOMP_PLUGIN_target_task_completion (async_data);
-  return NULL;
-}
-
-/* Part of the libgomp plugin interface.  Run a kernel like GOMP_OFFLOAD_run
-   does, but asynchronously and call GOMP_PLUGIN_target_task_completion when it
-   has finished.  */
-
-void
-GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
-			void **args, void *async_data)
-{
-  pthread_t pt;
-  struct async_run_info *info;
-  HSA_DEBUG ("GOMP_OFFLOAD_async_run invoked\n")
-  info = GOMP_PLUGIN_malloc (sizeof (struct async_run_info));
-
-  info->device = device;
-  info->tgt_fn = tgt_fn;
-  info->tgt_vars = tgt_vars;
-  info->args = args;
-  info->async_data = async_data;
-
-  int err = pthread_create (&pt, NULL, &run_kernel_asynchronously, info);
-  if (err != 0)
-    GOMP_PLUGIN_fatal ("HSA asynchronous thread creation failed: %s",
-		       strerror (err));
-  err = pthread_detach (pt);
-  if (err != 0)
-    GOMP_PLUGIN_fatal ("Failed to detach a thread to run HSA kernel "
-		       "asynchronously: %s", strerror (err));
-}
-
 /* Deinitialize all information associated with MODULE and kernels within
    it.  Return TRUE on success.  */
 
diff --git a/libgomp/target.c b/libgomp/target.c
index 3dd119f..456ed78 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -1868,6 +1868,59 @@ GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs,
     gomp_exit_data (devicep, mapnum, hostaddrs, sizes, kinds);
 }
 
+/* Information to be passed to a thread running a kernel asycnronously.  */
+
+struct async_run_info
+{
+  struct gomp_device_descr *devicep;
+  void *tgt_fn;
+  void *tgt_vars;
+  void **args;
+  void *async_data;
+};
+
+/* Thread routine to run a kernel asynchronously.  */
+
+static void *
+default_async_run_1 (void *thread_arg)
+{
+  struct async_run_info *info = (struct async_run_info *) thread_arg;
+  struct gomp_device_descr *devicep = info->devicep;
+  void *tgt_fn = info->tgt_fn;
+  void *tgt_vars = info->tgt_vars;
+  void **args = info->args;
+  void *async_data = info->async_data;
+
+  free (info);
+  devicep->run_func (devicep->target_id, tgt_fn, tgt_vars, args);
+  GOMP_PLUGIN_target_task_completion (async_data);
+  return NULL;
+}
+
+static void
+default_async_run (struct gomp_device_descr *devicep, void *tgt_fn,
+		   void *tgt_vars, void **args, void *async_data)
+{
+  pthread_t pt;
+  struct async_run_info *info;
+  info = GOMP_PLUGIN_malloc (sizeof (struct async_run_info));
+
+  info->devicep = devicep;
+  info->tgt_fn = tgt_fn;
+  info->tgt_vars = tgt_vars;
+  info->args = args;
+  info->async_data = async_data;
+
+  int err = pthread_create (&pt, NULL, &default_async_run_1, info);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("Asynchronous thread creation failed: %s",
+		       strerror (err));
+  err = pthread_detach (pt);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("Failed to detach a thread to run kernel "
+		       "asynchronously: %s", strerror (err));
+}
+
 bool
 gomp_target_task_fn (void *data)
 {
@@ -1909,8 +1962,12 @@ gomp_target_task_fn (void *data)
 	}
       ttask->state = GOMP_TARGET_TASK_READY_TO_RUN;
 
-      devicep->async_run_func (devicep->target_id, fn_addr, actual_arguments,
-			       ttask->args, (void *) ttask);
+      if (devicep->async_run_func)
+	devicep->async_run_func (devicep->target_id, fn_addr, actual_arguments,
+				 ttask->args, (void *) ttask);
+      else
+	default_async_run (devicep, fn_addr, actual_arguments, ttask->args,
+			   (void *) ttask);
       return true;
     }
   else if (devicep == NULL
@@ -2393,7 +2450,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
     {
       DLSYM (run);
-      DLSYM (async_run);
+      DLSYM_OPT (async_run, async_run);
       DLSYM_OPT (can_run, can_run);
       DLSYM (dev2dev);
     }

[-- Attachment #3: 0002-Implement-omp-async-support-for-nvptx.patch --]
[-- Type: text/x-patch, Size: 1220 bytes --]

Implement omp async support for nvptx

2017-10-27  Tom de Vries  <tom@codesourcery.com>

	PR libgomp/81688
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_run): Call
	nvptx_attach_host_thread_to_device.
	(GOMP_OFFLOAD_async_run): Remove.

---
 libgomp/plugin/plugin-nvptx.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 71630b5..4e0009f 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -2127,6 +2127,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   const char *maybe_abort_msg = "(perhaps abort was called)";
   int teams = 0, threads = 0;
 
+  nvptx_attach_host_thread_to_device (ord);
+
   if (!args)
     GOMP_PLUGIN_fatal ("No target arguments provided");
   while (*args)
@@ -2170,10 +2172,3 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
   nvptx_stacks_free (stacks, teams * threads);
 }
-
-void
-GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
-			void *async_data)
-{
-  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
-}

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Implement omp async support for nvptx
  2017-10-27 14:13     ` [PATCH] Implement omp async support for nvptx Tom de Vries
@ 2017-10-30  7:25       ` Jakub Jelinek
  2017-10-30 12:02         ` Tom de Vries
  0 siblings, 1 reply; 19+ messages in thread
From: Jakub Jelinek @ 2017-10-30  7:25 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Alexander Monakov, Thomas Schwinge, gcc-patches

On Fri, Oct 27, 2017 at 03:57:28PM +0200, Tom de Vries wrote:
> how about this approach:
> 1 - Move async_run from plugin-hsa.c to default_async_run
> 2 - Implement omp async support for nvptx
> ?
> 
> The first patch moves the GOMP_OFFLOAD_async_run implementation from
> plugin-hsa.c to target.c, making it the default implementation if the plugin
> does not define the GOMP_OFFLOAD_async_run symbol.
> 
> The second patch removes the GOMP_OFFLOAD_async_run symbol from the nvptx
> plugin, activating the default implementation, and makes sure
> GOMP_OFFLOAD_run can be called from a fresh thread.
> 
> I've tested this with libgomp.c/c.exp and the previously failing target-33.c
> and target-34.c are now passing, and there are no regressions.
> 
> OK for trunk after complete testing (and adding function comment for
> default_async_run)?

Can't PTX do better than this?  What I mean is that while we probably need
to take the device lock for the possible memory transfers and deallocation
at the end of the region and thus perform some action on the host in between
the end of the async target region and data copying/deallocation, can't we
have a single thread per device instead of one thread per async target
region, use CUDA async APIs and poll for all the pending async regions
together?  I mean, if we need to take the device lock, then we need to
serialize the finalization anyway and reusing the same thread would
significantly decrease the overhead if there are many async regions.
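
Roughly something like this is what I have in mind (a completely made-up,
untested sketch; GOMP_PLUGIN_target_task_completion is the only real
interface used here, every other name is invented and locking against the
code appending new regions is omitted):

struct pending_region
{
  struct pending_region *next;
  void *async_data;     /* For GOMP_PLUGIN_target_task_completion.  */
  void *plugin_cookie;  /* E.g. a CUDA event recorded after the launch.  */
};

/* One of these threads per device, instead of one pthread per async
   target region.  region_done_p stands for whatever cheap completion
   query the plugin provides (for PTX e.g. based on cuEventQuery).  */

static void *
device_poll_thread (void *arg)
{
  struct pending_region **list = arg;
  for (;;)
    {
      struct pending_region **p = list;
      while (*p)
        if (region_done_p ((*p)->plugin_cookie))
          {
            struct pending_region *r = *p;
            *p = r->next;
            GOMP_PLUGIN_target_task_completion (r->async_data);
            free (r);
          }
        else
          p = &(*p)->next;
      /* Sleep or block on a condition variable when nothing is pending.  */
    }
  return NULL;
}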

And, if it at least in theory can do better than that, then even if we
punt on that for now due to time/resource constraints, maybe it would be
better to do this inside of plugin where it can be more easily replaced
later.

	Jakub

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Implement omp async support for nvptx
  2017-10-30  7:25       ` Jakub Jelinek
@ 2017-10-30 12:02         ` Tom de Vries
  2017-10-30 13:52           ` Tom de Vries
  0 siblings, 1 reply; 19+ messages in thread
From: Tom de Vries @ 2017-10-30 12:02 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Alexander Monakov, Thomas Schwinge, gcc-patches

On 10/30/2017 08:15 AM, Jakub Jelinek wrote:
> On Fri, Oct 27, 2017 at 03:57:28PM +0200, Tom de Vries wrote:
>> how about this approach:
>> 1 - Move async_run from plugin-hsa.c to default_async_run
>> 2 - Implement omp async support for nvptx
>> ?
>>
>> The first patch moves the GOMP_OFFLOAD_async_run implementation from
>> plugin-hsa.c to target.c, making it the default implementation if the plugin
>> does not define the GOMP_OFFLOAD_async_run symbol.
>>
>> The second patch removes the GOMP_OFFLOAD_async_run symbol from the nvptx
>> plugin, activating the default implementation, and makes sure
>> GOMP_OFFLOAD_run can be called from a fresh thread.
>>
>> I've tested this with libgomp.c/c.exp and the previously failing target-33.c
>> and target-34.c are now passing, and there are no regressions.
>>
>> OK for trunk after complete testing (and adding function comment for
>> default_async_run)?
> 
> Can't PTX do better than this?

It can.

I found your comment describing this implementation as a hack here ( 
https://gcc.gnu.org/ml/gcc-patches/2015-11/msg02726.html ) after sending 
this on Friday, and thought about things a little bit more. So let me 
try again.

This is not an optimal nvptx async implementation. This is a proposal to 
have a poor man's async implementation in the common code, rather than 
having libgomp accel ports implementing GOMP_OFFLOAD_async_run as abort 
at first.

AFAIU, the purpose of the async functionality is to have jobs executed 
concurrently and/or interleaved on the device. While this implementation 
does not offer jobs to the device in separate queues, such that the 
device can decide on concurrent and interleaved behaviour, it does 
present the device with a possibly interleaved job schedule (which is 
slightly better than having a poor man's async implementation that is 
just synchronous).

In order to have an optimal implementation, one would still need to 
implement the GOMP_OFFLOAD_async_run hook, which would bypass this 
implementation.

I'm not sure how useful this would be, but I can even imagine using this 
if all the accel ports have implemented the GOMP_OFFLOAD_async_run hook.
We could define a variable OMP_ASYNC with semantics:
- 0: ignore plugins GOMP_OFFLOAD_async_run hook, fall back on
      synchronous behaviour
- 1: ignore plugins GOMP_OFFLOAD_async_run hook, use poor man's
      implementation.
- 2: use plugins GOMP_OFFLOAD_async_run hook.
This could be helpful in debugging programs with async behaviour.

> What I mean is that while we probably need
> to take the device lock for the possible memory transfers and deallocation
> at the end of the region and thus perform some action on the host in between
> the end of the async target region and data copying/deallocation, can't we
> have a single thread per device instead of one thread per async target
> region, use CUDA async APIs and poll for all the pending async regions
> together?  I mean, if we need to take the device lock, then we need to
> serialize the finalization anyway and reusing the same thread would
> significantly decrease the overhead if there are many async regions.
> 

As for the poor man's implementation, it is indeed inefficient and could 
be improved, but I wonder if it's worth the effort. [ Perhaps though for 
debugging purposes the ability to control the interleaving in some way 
might be useful. ]

I imagine that an efficient nvptx implementation will use cuda streams, 
which are queues where both kernels and mem transfers can be queued. So 
rather than calling GOMP_PLUGIN_target_task_completion once the kernel 
is done, it would be more efficient to be able to call a similar function 
that schedules the data transfers that need to happen, without assuming 
that the kernel is already done. However, AFAIU, that won't take care of 
deallocation. So I guess the first approach will be to use cuda events 
to poll whether a kernel has completed, and then call 
GOMP_PLUGIN_target_task_completion.
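
Something like this in the nvptx plugin is what I mean (again just a
sketch, error checking mostly omitted; async_data is the opaque pointer
libgomp hands to GOMP_OFFLOAD_async_run):

  /* At launch time, use a non-default stream and record an event.  */
  CUstream stream;
  CUevent done;
  cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  cuEventCreate (&done, CU_EVENT_DISABLE_TIMING);
  /* ... launch the kernel with cuLaunchKernel on STREAM instead of the
     default stream ...  */
  cuEventRecord (done, stream);

  /* Later, from whatever polls the pending regions:  */
  CUresult r = cuEventQuery (done);
  if (r == CUDA_SUCCESS)
    GOMP_PLUGIN_target_task_completion (async_data);
  else if (r != CUDA_ERROR_NOT_READY)
    GOMP_PLUGIN_fatal ("cuEventQuery error: %s", cuda_error (r));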

> And, if it at least in theory can do better than that, then even if we
> punt on that for now due to time/resource constraints, maybe it would be
> better to do this inside of plugin where it can be more easily replaced
> later.

I'm trying to argue the other way round: if there is no optimal 
implementation in the plugin, let's provide at least a non-optimal but 
non-synchronous implementation as default, and exercise the async code 
rather than having tests fail with a plugin abort.

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Implement omp async support for nvptx
  2017-10-30 12:02         ` Tom de Vries
@ 2017-10-30 13:52           ` Tom de Vries
  0 siblings, 0 replies; 19+ messages in thread
From: Tom de Vries @ 2017-10-30 13:52 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Alexander Monakov, Thomas Schwinge, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 5128 bytes --]

On 10/30/2017 12:55 PM, Tom de Vries wrote:
> On 10/30/2017 08:15 AM, Jakub Jelinek wrote:
>> On Fri, Oct 27, 2017 at 03:57:28PM +0200, Tom de Vries wrote:
>>> how about this approach:
>>> 1 - Move async_run from plugin-hsa.c to default_async_run
>>> 2 - Implement omp async support for nvptx
>>> ?
>>>
>>> The first patch moves the GOMP_OFFLOAD_async_run implementation from
>>> plugin-hsa.c to target.c, making it the default implementation if the 
>>> plugin
>>> does not define the GOMP_OFFLOAD_async_run symbol.
>>>
>>> The second patch removes the GOMP_OFFLOAD_async_run symbol from the 
>>> nvptx
>>> plugin, activating the default implementation, and makes sure
>>> GOMP_OFFLOAD_run can be called from a fresh thread.
>>>
>>> I've tested this with libgomp.c/c.exp and the previously failing 
>>> target-33.c
>>> and target-34.c are now passing, and there are no regressions.
>>>
>>> OK for trunk after complete testing (and adding function comment for
>>> default_async_run)?
>>
>> Can't PTX do better than this?
> 
> It can.
> 
> I found your comment describing this implementation as a hack here ( 
> https://gcc.gnu.org/ml/gcc-patches/2015-11/msg02726.html ) after sending 
> this on Friday, and thought about things a little bit more. So let me 
> try again.
> 
> This is not an optimal nvptx async implementation. This is a proposal to 
> have a poor man's async implementation in the common code, rather than 
> having libgomp accel ports implementing GOMP_OFFLOAD_async_run as abort 
> at first.
> 
> AFAIU, the purpose of the async functionality is to have jobs executed 
> concurrently and/or interleaved on the device. While this implementation 
> does not offer jobs to the device in separate queues, such that the 
> device can decide on concurrent and interleaved behaviour, it does 
> present the device with a possibly interleaved job schedule (which is 
> slightly better than having a poor man's async implementation that is 
> just synchronous).
> 
> In order to have an optimal implementation, one would still need to 
> implement the GOMP_OFFLOAD_async_run hook, which would bypass this 
> implementation.
> 
> I'm not sure how useful this would be, but I can even imagine using this 
> if all the accel ports have implemented the GOMP_OFFLOAD_async_run hook.
> We could define a variable OMP_ASYNC with semantics:
> - 0: ignore plugins GOMP_OFFLOAD_async_run hook, fall back on
>       synchronous behaviour
> - 1: ignore plugins GOMP_OFFLOAD_async_run hook, use poor man's
>       implementation.
> - 2: use plugins GOMP_OFFLOAD_async_run hook.
> This could be helpful in debugging programs with async behaviour.
> 
>> What I mean is that while we probably need
>> to take the device lock for the possible memory transfers and 
>> deallocation
>> at the end of the region and thus perform some action on the host in 
>> between
>> the end of the async target region and data copying/deallocation, 
>> can't we
>> have a single thread per device instead of one thread per async target
>> region, use CUDA async APIs and poll for all the pending async regions
>> together?  I mean, if we need to take the device lock, then we need to
>> serialize the finalization anyway and reusing the same thread would
>> significantly decrease the overhead if there are many async regions.
>>
> 
> As for the poor man's implementation, it is indeed inefficient and could 
> be improved, but I wonder if it's worth the effort. [ Perhaps though for 
> debugging purposes the ability to control the interleaving in some way 
> might be useful. ]
> 
> I imagine that an efficient nvptx implementation will use cuda streams, 
> which are queues where both kernels and mem transfers can be queued. So 
> rather than calling GOMP_PLUGIN_target_task_completion once the kernel 
> is done, it would be more efficient to be able to call a similar function 
> that schedules the data transfers that need to happen, without assuming 
> that the kernel is already done. However, AFAIU, that won't take care of 
> deallocation. So I guess the first approach will be to use cuda events 
> to poll whether a kernel has completed, and then call 
> GOMP_PLUGIN_target_task_completion.
> 
>> And, if it at least in theory can do better than that, then even if we
>> punt on that for now due to time/resource constraints, maybe it would be
>> better to do this inside of plugin where it can be more easily replaced
>> later.
> 
> I'm trying to argue the other way round: if there is no optimal 
> implementation in the plugin, let's provide at least a non-optimal but 
> non-synchronous implementation as default, and exercise the async code 
> rather than having tests fail with a plugin abort.

And if you're concerned about not having tests failing to remind us that 
the default async support is not optimal and the GOMP_OFFLOAD_async_run 
hook is missing, we can add target-3[34]-opt.c testcases that force the 
use of the GOMP_OFFLOAD_async_run hook, and will abort if the hook is 
missing. Updated untested patch attached (implemented the env var 
proposed above as OMP_TARGET_ASYNC).

Thanks,
- Tom

[-- Attachment #2: 0001-Move-async_run-from-plugin-hsa.c-to-default_async_run.patch --]
[-- Type: text/x-patch, Size: 8882 bytes --]

Move async_run from plugin-hsa.c to default_async_run

2017-10-27  Tom de Vries  <tom@codesourcery.com>

	* env.c (gomp_target_async, gomp_target_async_set): Define.
	(initialize_env): Handle environment variable OMP_TARGET_ASYNC.
	* libgomp.h (gomp_target_async, gomp_target_async_set): Declare.
	* plugin/plugin-hsa.c (struct async_run_info): Move ...
	(run_kernel_asynchronously): Rename to ...
	(GOMP_OFFLOAD_async_run): Rename to ...
	* target.c (struct async_run_info): ... here.
	(default_async_run_1): ... this.
	(default_async_run): ... this.
	(gomp_target_task_fn): Handle gomp_target_async.
	(gomp_load_plugin_for_device): Make async_run optional.
	* testsuite/libgomp.c/target-33-opt.c: New test.
	* testsuite/libgomp.c/target-34-opt.c: New test.

---
 libgomp/env.c                               | 13 +++++
 libgomp/libgomp.h                           |  2 +
 libgomp/plugin/plugin-hsa.c                 | 58 -------------------
 libgomp/target.c                            | 89 +++++++++++++++++++++++++++--
 libgomp/testsuite/libgomp.c/target-33-opt.c |  3 +
 libgomp/testsuite/libgomp.c/target-34-opt.c |  3 +
 6 files changed, 106 insertions(+), 62 deletions(-)

diff --git a/libgomp/env.c b/libgomp/env.c
index 802c73b..6e4d22c 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -75,6 +75,8 @@ struct gomp_task_icv gomp_global_icv = {
 
 unsigned long gomp_max_active_levels_var = INT_MAX;
 bool gomp_cancel_var = false;
+int gomp_target_async = 2;
+bool gomp_target_async_set = false;
 int gomp_max_task_priority_var = 0;
 #ifndef HAVE_SYNC_BUILTINS
 gomp_mutex_t gomp_managed_threads_lock;
@@ -1230,6 +1232,17 @@ initialize_env (void)
   parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var);
   parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true);
   parse_int ("OMP_MAX_TASK_PRIORITY", &gomp_max_task_priority_var, true);
+  if (parse_int ("OMP_TARGET_ASYNC", &gomp_target_async, true))
+    {
+      if (0 <= gomp_target_async && gomp_target_async <= 2)
+	gomp_target_async_set = true;
+      else
+	{
+	  gomp_error ("OMP_TARGET_ASYNC not in 0-2 range: %d. "
+		      "Using default 2", gomp_target_async);
+	  gomp_target_async = 2;
+	}
+    }
   parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var,
 		       true);
   if (parse_unsigned_long ("OMP_THREAD_LIMIT", &thread_limit_var, false))
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 940b5b8..d896b30 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -354,6 +354,8 @@ extern struct gomp_task_icv gomp_global_icv;
 extern gomp_mutex_t gomp_managed_threads_lock;
 #endif
 extern unsigned long gomp_max_active_levels_var;
+extern int gomp_target_async;
+extern bool gomp_target_async_set;
 extern bool gomp_cancel_var;
 extern int gomp_max_task_priority_var;
 extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
diff --git a/libgomp/plugin/plugin-hsa.c b/libgomp/plugin/plugin-hsa.c
index fc08f5d..65a89a3 100644
--- a/libgomp/plugin/plugin-hsa.c
+++ b/libgomp/plugin/plugin-hsa.c
@@ -1625,64 +1625,6 @@ GOMP_OFFLOAD_run (int n __attribute__((unused)),
   run_kernel (kernel, vars, kla);
 }
 
-/* Information to be passed to a thread running a kernel asycnronously.  */
-
-struct async_run_info
-{
-  int device;
-  void *tgt_fn;
-  void *tgt_vars;
-  void **args;
-  void *async_data;
-};
-
-/* Thread routine to run a kernel asynchronously.  */
-
-static void *
-run_kernel_asynchronously (void *thread_arg)
-{
-  struct async_run_info *info = (struct async_run_info *) thread_arg;
-  int device = info->device;
-  void *tgt_fn = info->tgt_fn;
-  void *tgt_vars = info->tgt_vars;
-  void **args = info->args;
-  void *async_data = info->async_data;
-
-  free (info);
-  GOMP_OFFLOAD_run (device, tgt_fn, tgt_vars, args);
-  GOMP_PLUGIN_target_task_completion (async_data);
-  return NULL;
-}
-
-/* Part of the libgomp plugin interface.  Run a kernel like GOMP_OFFLOAD_run
-   does, but asynchronously and call GOMP_PLUGIN_target_task_completion when it
-   has finished.  */
-
-void
-GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
-			void **args, void *async_data)
-{
-  pthread_t pt;
-  struct async_run_info *info;
-  HSA_DEBUG ("GOMP_OFFLOAD_async_run invoked\n")
-  info = GOMP_PLUGIN_malloc (sizeof (struct async_run_info));
-
-  info->device = device;
-  info->tgt_fn = tgt_fn;
-  info->tgt_vars = tgt_vars;
-  info->args = args;
-  info->async_data = async_data;
-
-  int err = pthread_create (&pt, NULL, &run_kernel_asynchronously, info);
-  if (err != 0)
-    GOMP_PLUGIN_fatal ("HSA asynchronous thread creation failed: %s",
-		       strerror (err));
-  err = pthread_detach (pt);
-  if (err != 0)
-    GOMP_PLUGIN_fatal ("Failed to detach a thread to run HSA kernel "
-		       "asynchronously: %s", strerror (err));
-}
-
 /* Deinitialize all information associated with MODULE and kernels within
    it.  Return TRUE on success.  */
 
diff --git a/libgomp/target.c b/libgomp/target.c
index 3dd119f..0930fa4 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -1868,6 +1868,59 @@ GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs,
     gomp_exit_data (devicep, mapnum, hostaddrs, sizes, kinds);
 }
 
+/* Information to be passed to a thread running a kernel asynchronously.  */
+
+struct async_run_info
+{
+  struct gomp_device_descr *devicep;
+  void *tgt_fn;
+  void *tgt_vars;
+  void **args;
+  void *async_data;
+};
+
+/* Thread routine to run a kernel asynchronously.  */
+
+static void *
+default_async_run_1 (void *thread_arg)
+{
+  struct async_run_info *info = (struct async_run_info *) thread_arg;
+  struct gomp_device_descr *devicep = info->devicep;
+  void *tgt_fn = info->tgt_fn;
+  void *tgt_vars = info->tgt_vars;
+  void **args = info->args;
+  void *async_data = info->async_data;
+
+  free (info);
+  devicep->run_func (devicep->target_id, tgt_fn, tgt_vars, args);
+  GOMP_PLUGIN_target_task_completion (async_data);
+  return NULL;
+}
+
+static void
+default_async_run (struct gomp_device_descr *devicep, void *tgt_fn,
+		   void *tgt_vars, void **args, void *async_data)
+{
+  pthread_t pt;
+  struct async_run_info *info;
+  info = GOMP_PLUGIN_malloc (sizeof (struct async_run_info));
+
+  info->devicep = devicep;
+  info->tgt_fn = tgt_fn;
+  info->tgt_vars = tgt_vars;
+  info->args = args;
+  info->async_data = async_data;
+
+  int err = pthread_create (&pt, NULL, &default_async_run_1, info);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("Asynchronous thread creation failed: %s",
+		       strerror (err));
+  err = pthread_detach (pt);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("Failed to detach a thread to run kernel "
+		       "asynchronously: %s", strerror (err));
+}
+
 bool
 gomp_target_task_fn (void *data)
 {
@@ -1909,9 +1962,37 @@ gomp_target_task_fn (void *data)
 	}
       ttask->state = GOMP_TARGET_TASK_READY_TO_RUN;
 
-      devicep->async_run_func (devicep->target_id, fn_addr, actual_arguments,
-			       ttask->args, (void *) ttask);
-      return true;
+      if (gomp_target_async == 2)
+	{
+	  if (devicep->async_run_func)
+	    {
+	      devicep->async_run_func (devicep->target_id, fn_addr,
+				       actual_arguments, ttask->args,
+				       (void *) ttask);
+	      return true;
+	    }
+	  else if (gomp_target_async_set)
+	    gomp_fatal ("GOMP_OFFLOAD_async_run not implemented, cannot support"
+			" OMP_TARGET_ASYNC == 2");
+	}
+
+      if (gomp_target_async >= 1)
+	{
+	  default_async_run (devicep, fn_addr, actual_arguments, ttask->args,
+			     (void *) ttask);
+	  return true;
+	}
+
+      if (gomp_target_async == 0)
+	{
+	  devicep->run_func (devicep->target_id, fn_addr, actual_arguments,
+			     ttask->args);
+	  GOMP_PLUGIN_target_task_completion (ttask);
+	  return true;
+	}
+
+      gomp_fatal ("gomp_target_async not in 0-2 range: %d",
+		  gomp_target_async);
     }
   else if (devicep == NULL
 	   || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
@@ -2393,7 +2474,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
     {
       DLSYM (run);
-      DLSYM (async_run);
+      DLSYM_OPT (async_run, async_run);
       DLSYM_OPT (can_run, can_run);
       DLSYM (dev2dev);
     }
diff --git a/libgomp/testsuite/libgomp.c/target-33-opt.c b/libgomp/testsuite/libgomp.c/target-33-opt.c
new file mode 100644
index 0000000..ee5b52c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-33-opt.c
@@ -0,0 +1,3 @@
+/* { dg-set-target-env-var OMP_TARGET_ASYNC "2" } */
+
+#include "target-33.c"
diff --git a/libgomp/testsuite/libgomp.c/target-34-opt.c b/libgomp/testsuite/libgomp.c/target-34-opt.c
new file mode 100644
index 0000000..0c8b197
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-34-opt.c
@@ -0,0 +1,3 @@
+/* { dg-set-target-env-var OMP_TARGET_ASYNC "2" } */
+
+#include "target-34.c"

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2017-10-25 12:03   ` Jakub Jelinek
  2017-10-27 14:13     ` [PATCH] Implement omp async support for nvptx Tom de Vries
@ 2018-12-06 17:02     ` Thomas Schwinge
  2018-12-06 17:19       ` Jakub Jelinek
  2019-12-18 17:15     ` Thomas Schwinge
  2 siblings, 1 reply; 19+ messages in thread
From: Thomas Schwinge @ 2018-12-06 17:02 UTC (permalink / raw)
  To: Jakub Jelinek, gcc-patches
  Cc: Martin Jambor, Alexander Monakov, Chung-Lin Tang

Hi Jakub!

While reviewing Chung-Lin's
<https://gcc.gnu.org/ml/gcc-patches/2018-09/msg01428.html> "[PATCH 4/6,
OpenACC, libgomp] Async re-work, libgomp/target.c changes", I noticed the
following unrelated hunk.  Is that intentional or just an oversight that
it hasn't been included in your "gomp_coalesce_buf" changes (quoted below
for reference)?

commit 2abec5454063076ebd0fddf6ed25a3459c4f5ac3
Author: Thomas Schwinge <thomas@codesourcery.com>
Date:   Thu Dec 6 17:52:34 2018 +0100

    Coalesce host to device transfers in libgomp: link pointer
    
            libgomp/
            * target.c (gomp_map_vars): Call "gomp_copy_host2dev" instead of
            "devicep->host2dev_func".
---
 libgomp/target.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git libgomp/target.c libgomp/target.c
index 8ebc2a370a16..9cb2ec8d026f 100644
--- libgomp/target.c
+++ libgomp/target.c
@@ -957,9 +957,9 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 		    /* Set link pointer on target to the device address of the
 		       mapped object.  */
 		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
-		    devicep->host2dev_func (devicep->target_id,
-					    (void *) n->tgt_offset,
-					    &tgt_addr, sizeof (void *));
+		    gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
+					&tgt_addr, sizeof (void *), cbufp);
+
 		  }
 		array++;
 	      }

If approving this patch, please respond with "Reviewed-by: NAME <EMAIL>"
so that your effort will be recorded in the commit log, see
<https://gcc.gnu.org/wiki/Reviewed-by>.


Grüße
 Thomas


On Wed, 25 Oct 2017 13:38:50 +0200, Jakub Jelinek <jakub@redhat.com> wrote:
> Here is an updated patch with some renaming, extra macros, one extra inline
> function and comments.

> 2017-10-25  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* target.c (struct gomp_coalesce_buf): New type.
> 	(MAX_COALESCE_BUF_SIZE, MAX_COALESCE_BUF_GAP): Define.
> 	(gomp_coalesce_buf_add, gomp_to_device_kind_p): New functions.
> 	(gomp_copy_host2dev): Add CBUF argument, if copying into
> 	the cached ranges, memcpy into buffer instead of copying
> 	into device.
> 	(gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
> 	Add CBUF argument, pass it through to other calls.
> 	(gomp_map_vars): Aggregate copies from host to device if small enough
> 	and with small enough gaps in between into memcpy into a buffer and
> 	fewer host to device copies from the buffer.
> 	(gomp_update): Adjust gomp_copy_host2dev caller.
> 
> --- libgomp/target.c.jj	2017-10-24 12:07:03.763759657 +0200
> +++ libgomp/target.c	2017-10-25 13:17:31.608975390 +0200
> @@ -177,10 +177,122 @@ gomp_device_copy (struct gomp_device_des
>      }
>  }
>  
> +/* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
> +   host to device memory transfers.  */
> +
> +struct gomp_coalesce_buf
> +{
> +  /* Buffer into which gomp_copy_host2dev will memcpy data and from which
> +     it will be copied to the device.  */
> +  void *buf;
> +  struct target_mem_desc *tgt;
> +  /* Array with offsets, chunks[2 * i] is the starting offset and
> +     chunks[2 * i + 1] ending offset relative to tgt->tgt_start device address
> +     of chunks which are to be copied to buf and later copied to device.  */
> +  size_t *chunks;
> +  /* Number of chunks in chunks array, or -1 if coalesce buffering should not
> +     be performed.  */
> +  long chunk_cnt;
> +  /* During construction of chunks array, how many memory regions are within
> +     the last chunk.  If there is just one memory region for a chunk, we copy
> +     it directly to device rather than going through buf.  */
> +  long use_cnt;
> +};
> +
> +/* Maximum size of memory region considered for coalescing.  Larger copies
> +   are performed directly.  */
> +#define MAX_COALESCE_BUF_SIZE	(32 * 1024)
> +
> +/* Maximum size of a gap in between regions to consider them being copied
> +   within the same chunk.  All the device offsets considered are within
> +   newly allocated device memory, so it isn't fatal if we copy some padding
> +   in between from host to device.  The gaps come either from alignment
> +   padding or from memory regions which are not supposed to be copied from
> +   host to device (e.g. map(alloc:), map(from:) etc.).  */
> +#define MAX_COALESCE_BUF_GAP	(4 * 1024)
> +
> +/* Add region with device tgt_start relative offset and length to CBUF.  */
> +
> +static inline void
> +gomp_coalesce_buf_add (struct gomp_coalesce_buf *cbuf, size_t start, size_t len)
> +{
> +  if (len > MAX_COALESCE_BUF_SIZE || len == 0)
> +    return;
> +  if (cbuf->chunk_cnt)
> +    {
> +      if (cbuf->chunk_cnt < 0)
> +	return;
> +      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
> +	{
> +	  cbuf->chunk_cnt = -1;
> +	  return;
> +	}
> +      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + MAX_COALESCE_BUF_GAP)
> +	{
> +	  cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len;
> +	  cbuf->use_cnt++;
> +	  return;
> +	}
> +      /* If the last chunk is only used by one mapping, discard it,
> +	 as it will be one host to device copy anyway and
> +	 memcpying it around will only waste cycles.  */
> +      if (cbuf->use_cnt == 1)
> +	cbuf->chunk_cnt--;
> +    }
> +  cbuf->chunks[2 * cbuf->chunk_cnt] = start;
> +  cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len;
> +  cbuf->chunk_cnt++;
> +  cbuf->use_cnt = 1;
> +}
> +
> +/* Return true for mapping kinds which need to copy data from the
> +   host to device for regions that weren't previously mapped.  */
> +
> +static inline bool
> +gomp_to_device_kind_p (int kind)
> +{
> +  switch (kind)
> +    {
> +    case GOMP_MAP_ALLOC:
> +    case GOMP_MAP_FROM:
> +    case GOMP_MAP_FORCE_ALLOC:
> +    case GOMP_MAP_ALWAYS_FROM:
> +      return false;
> +    default:
> +      return true;
> +    }
> +}
> +
>  static void
>  gomp_copy_host2dev (struct gomp_device_descr *devicep,
> -		    void *d, const void *h, size_t sz)
> +		    void *d, const void *h, size_t sz,
> +		    struct gomp_coalesce_buf *cbuf)
>  {
> +  if (cbuf)
> +    {
> +      uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
> +      if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
> +	{
> +	  long first = 0;
> +	  long last = cbuf->chunk_cnt - 1;
> +	  while (first <= last)
> +	    {
> +	      long middle = (first + last) >> 1;
> +	      if (cbuf->chunks[2 * middle + 1] <= doff)
> +		first = middle + 1;
> +	      else if (cbuf->chunks[2 * middle] <= doff)
> +		{
> +		  if (doff + sz > cbuf->chunks[2 * middle + 1])
> +		    gomp_fatal ("internal libgomp cbuf error");
> +		  memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]),
> +			  h, sz);
> +		  return;
> +		}
> +	      else
> +		last = middle - 1;
> +	    }
> +	}
> +    }
>    gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
>  }
>  
> @@ -208,7 +320,7 @@ gomp_free_device_memory (struct gomp_dev
>  static inline void
>  gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
>  			splay_tree_key newn, struct target_var_desc *tgt_var,
> -			unsigned char kind)
> +			unsigned char kind, struct gomp_coalesce_buf *cbuf)
>  {
>    tgt_var->key = oldn;
>    tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
> @@ -232,7 +344,7 @@ gomp_map_vars_existing (struct gomp_devi
>  			(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
>  				  + newn->host_start - oldn->host_start),
>  			(void *) newn->host_start,
> -			newn->host_end - newn->host_start);
> +			newn->host_end - newn->host_start, cbuf);
>  
>    if (oldn->refcount != REFCOUNT_INFINITY)
>      oldn->refcount++;
> @@ -247,7 +359,8 @@ get_kind (bool short_mapkind, void *kind
>  
>  static void
>  gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
> -		  uintptr_t target_offset, uintptr_t bias)
> +		  uintptr_t target_offset, uintptr_t bias,
> +		  struct gomp_coalesce_buf *cbuf)
>  {
>    struct gomp_device_descr *devicep = tgt->device_descr;
>    struct splay_tree_s *mem_map = &devicep->mem_map;
> @@ -257,11 +370,10 @@ gomp_map_pointer (struct target_mem_desc
>    if (cur_node.host_start == (uintptr_t) NULL)
>      {
>        cur_node.tgt_offset = (uintptr_t) NULL;
> -      /* FIXME: see comment about coalescing host/dev transfers below.  */
>        gomp_copy_host2dev (devicep,
>  			  (void *) (tgt->tgt_start + target_offset),
>  			  (void *) &cur_node.tgt_offset,
> -			  sizeof (void *));
> +			  sizeof (void *), cbuf);
>        return;
>      }
>    /* Add bias to the pointer value.  */
> @@ -280,15 +392,15 @@ gomp_map_pointer (struct target_mem_desc
>       array section.  Now subtract bias to get what we want
>       to initialize the pointer with.  */
>    cur_node.tgt_offset -= bias;
> -  /* FIXME: see comment about coalescing host/dev transfers below.  */
>    gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
> -		      (void *) &cur_node.tgt_offset, sizeof (void *));
> +		      (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
>  }
>  
>  static void
>  gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
>  			  size_t first, size_t i, void **hostaddrs,
> -			  size_t *sizes, void *kinds)
> +			  size_t *sizes, void *kinds,
> +			  struct gomp_coalesce_buf *cbuf)
>  {
>    struct gomp_device_descr *devicep = tgt->device_descr;
>    struct splay_tree_s *mem_map = &devicep->mem_map;
> @@ -306,7 +418,7 @@ gomp_map_fields_existing (struct target_
>        && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
>      {
>        gomp_map_vars_existing (devicep, n2, &cur_node,
> -			      &tgt->list[i], kind & typemask);
> +			      &tgt->list[i], kind & typemask, cbuf);
>        return;
>      }
>    if (sizes[i] == 0)
> @@ -322,7 +434,7 @@ gomp_map_fields_existing (struct target_
>  		 == n2->tgt_offset - n->tgt_offset)
>  	    {
>  	      gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
> -				      kind & typemask);
> +				      kind & typemask, cbuf);
>  	      return;
>  	    }
>  	}
> @@ -334,7 +446,7 @@ gomp_map_fields_existing (struct target_
>  	  && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
>  	{
>  	  gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
> -				  kind & typemask);
> +				  kind & typemask, cbuf);
>  	  return;
>  	}
>      }
> @@ -381,6 +493,7 @@ gomp_map_vars (struct gomp_device_descr
>    tgt->list_count = mapnum;
>    tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
>    tgt->device_descr = devicep;
> +  struct gomp_coalesce_buf cbuf, *cbufp = NULL;
>  
>    if (mapnum == 0)
>      {
> @@ -391,11 +504,25 @@ gomp_map_vars (struct gomp_device_descr
>  
>    tgt_align = sizeof (void *);
>    tgt_size = 0;
> +  cbuf.chunks = NULL;
> +  cbuf.chunk_cnt = -1;
> +  cbuf.use_cnt = 0;
> +  cbuf.buf = NULL;
> +  if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
> +    {
> +      cbuf.chunks
> +	= (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
> +      cbuf.chunk_cnt = 0;
> +    }
>    if (pragma_kind == GOMP_MAP_VARS_TARGET)
>      {
>        size_t align = 4 * sizeof (void *);
>        tgt_align = align;
>        tgt_size = mapnum * sizeof (void *);
> +      cbuf.chunk_cnt = 1;
> +      cbuf.use_cnt = 1 + (mapnum > 1);
> +      cbuf.chunks[0] = 0;
> +      cbuf.chunks[1] = tgt_size;
>      }
>  
>    gomp_mutex_lock (&devicep->lock);
> @@ -449,19 +576,26 @@ gomp_map_vars (struct gomp_device_descr
>  	      size_t align = (size_t) 1 << (kind >> rshift);
>  	      if (tgt_align < align)
>  		tgt_align = align;
> -	      tgt_size -= (uintptr_t) hostaddrs[first]
> -			  - (uintptr_t) hostaddrs[i];
> +	      tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
>  	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
> -	      tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
> +	      tgt_size += cur_node.host_end - cur_node.host_start;
>  	      not_found_cnt += last - i;
>  	      for (i = first; i <= last; i++)
> -		tgt->list[i].key = NULL;
> +		{
> +		  tgt->list[i].key = NULL;
> +		  if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
> +					     & typemask))
> +		    gomp_coalesce_buf_add (&cbuf,
> +					   tgt_size - cur_node.host_end
> +					   + (uintptr_t) hostaddrs[i],
> +					   sizes[i]);
> +		}
>  	      i--;
>  	      continue;
>  	    }
>  	  for (i = first; i <= last; i++)
>  	    gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
> -				      sizes, kinds);
> +				      sizes, kinds, NULL);
>  	  i--;
>  	  continue;
>  	}
> @@ -485,6 +619,8 @@ gomp_map_vars (struct gomp_device_descr
>  	  if (tgt_align < align)
>  	    tgt_align = align;
>  	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
> +	  gomp_coalesce_buf_add (&cbuf, tgt_size,
> +				 cur_node.host_end - cur_node.host_start);
>  	  tgt_size += cur_node.host_end - cur_node.host_start;
>  	  has_firstprivate = true;
>  	  continue;
> @@ -504,7 +640,7 @@ gomp_map_vars (struct gomp_device_descr
>  	n = splay_tree_lookup (mem_map, &cur_node);
>        if (n && n->refcount != REFCOUNT_LINK)
>  	gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
> -				kind & typemask);
> +				kind & typemask, NULL);
>        else
>  	{
>  	  tgt->list[i].key = NULL;
> @@ -514,6 +650,9 @@ gomp_map_vars (struct gomp_device_descr
>  	  if (tgt_align < align)
>  	    tgt_align = align;
>  	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
> +	  if (gomp_to_device_kind_p (kind & typemask))
> +	    gomp_coalesce_buf_add (&cbuf, tgt_size,
> +				   cur_node.host_end - cur_node.host_start);
>  	  tgt_size += cur_node.host_end - cur_node.host_start;
>  	  if ((kind & typemask) == GOMP_MAP_TO_PSET)
>  	    {
> @@ -562,6 +701,19 @@ gomp_map_vars (struct gomp_device_descr
>        tgt->tgt_start = (uintptr_t) tgt->to_free;
>        tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1);
>        tgt->tgt_end = tgt->tgt_start + tgt_size;
> +
> +      if (cbuf.use_cnt == 1)
> +	cbuf.chunk_cnt--;
> +      if (cbuf.chunk_cnt > 0)
> +	{
> +	  cbuf.buf
> +	    = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]);
> +	  if (cbuf.buf)
> +	    {
> +	      cbuf.tgt = tgt;
> +	      cbufp = &cbuf;
> +	    }
> +	}
>      }
>    else
>      {
> @@ -600,7 +752,7 @@ gomp_map_vars (struct gomp_device_descr
>  		len = sizes[i];
>  		gomp_copy_host2dev (devicep,
>  				    (void *) (tgt->tgt_start + tgt_size),
> -				    (void *) hostaddrs[i], len);
> +				    (void *) hostaddrs[i], len, cbufp);
>  		tgt_size += len;
>  		continue;
>  	      case GOMP_MAP_FIRSTPRIVATE_INT:
> @@ -633,7 +785,7 @@ gomp_map_vars (struct gomp_device_descr
>  		  }
>  		for (i = first; i <= last; i++)
>  		  gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
> -					    sizes, kinds);
> +					    sizes, kinds, cbufp);
>  		i--;
>  		continue;
>  	      case GOMP_MAP_ALWAYS_POINTER:
> @@ -658,7 +810,7 @@ gomp_map_vars (struct gomp_device_descr
>  					      + cur_node.host_start
>  					      - n->host_start),
>  				    (void *) &cur_node.tgt_offset,
> -				    sizeof (void *));
> +				    sizeof (void *), cbufp);
>  		cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
>  				      + cur_node.host_start - n->host_start;
>  		continue;
> @@ -674,7 +826,7 @@ gomp_map_vars (struct gomp_device_descr
>  	    splay_tree_key n = splay_tree_lookup (mem_map, k);
>  	    if (n && n->refcount != REFCOUNT_LINK)
>  	      gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
> -				      kind & typemask);
> +				      kind & typemask, cbufp);
>  	    else
>  	      {
>  		k->link_key = NULL;
> @@ -725,26 +877,22 @@ gomp_map_vars (struct gomp_device_descr
>  		  case GOMP_MAP_FORCE_TOFROM:
>  		  case GOMP_MAP_ALWAYS_TO:
>  		  case GOMP_MAP_ALWAYS_TOFROM:
> -		    /* FIXME: Perhaps add some smarts, like if copying
> -		       several adjacent fields from host to target, use some
> -		       host buffer to avoid sending each var individually.  */
>  		    gomp_copy_host2dev (devicep,
>  					(void *) (tgt->tgt_start
>  						  + k->tgt_offset),
>  					(void *) k->host_start,
> -					k->host_end - k->host_start);
> +					k->host_end - k->host_start, cbufp);
>  		    break;
>  		  case GOMP_MAP_POINTER:
>  		    gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
> -				      k->tgt_offset, sizes[i]);
> +				      k->tgt_offset, sizes[i], cbufp);
>  		    break;
>  		  case GOMP_MAP_TO_PSET:
> -		    /* FIXME: see above FIXME comment.  */
>  		    gomp_copy_host2dev (devicep,
>  					(void *) (tgt->tgt_start
>  						  + k->tgt_offset),
>  					(void *) k->host_start,
> -					k->host_end - k->host_start);
> +					k->host_end - k->host_start, cbufp);
>  
>  		    for (j = i + 1; j < mapnum; j++)
>  		      if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
> @@ -767,7 +915,7 @@ gomp_map_vars (struct gomp_device_descr
>  					    k->tgt_offset
>  					    + ((uintptr_t) hostaddrs[j]
>  					       - k->host_start),
> -					    sizes[j]);
> +					    sizes[j], cbufp);
>  			  i++;
>  			}
>  		    break;
> @@ -795,7 +943,7 @@ gomp_map_vars (struct gomp_device_descr
>  					(void *) (tgt->tgt_start
>  						  + k->tgt_offset),
>  					(void *) k->host_start,
> -					sizeof (void *));
> +					sizeof (void *), cbufp);
>  		    break;
>  		  default:
>  		    gomp_mutex_unlock (&devicep->lock);
> @@ -822,13 +970,23 @@ gomp_map_vars (struct gomp_device_descr
>        for (i = 0; i < mapnum; i++)
>  	{
>  	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
> -	  /* FIXME: see above FIXME comment.  */
>  	  gomp_copy_host2dev (devicep,
>  			      (void *) (tgt->tgt_start + i * sizeof (void *)),
> -			      (void *) &cur_node.tgt_offset, sizeof (void *));
> +			      (void *) &cur_node.tgt_offset, sizeof (void *),
> +			      cbufp);
>  	}
>      }
>  
> +  if (cbufp)
> +    {
> +      long c = 0;
> +      for (c = 0; c < cbuf.chunk_cnt; ++c)
> +	gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cbuf.chunks[2 * c]),
> +			    (char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]),
> +			    cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL);
> +      free (cbuf.buf);
> +    }
> +
>    /* If the variable from "omp target enter data" map-list was already mapped,
>       tgt is not needed.  Otherwise tgt will be freed by gomp_unmap_vars or
>       gomp_exit_data.  */
> @@ -970,7 +1128,7 @@ gomp_update (struct gomp_device_descr *d
>  	    size_t size = cur_node.host_end - cur_node.host_start;
>  
>  	    if (GOMP_MAP_COPY_TO_P (kind & typemask))
> -	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size);
> +	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
>  	    if (GOMP_MAP_COPY_FROM_P (kind & typemask))
>  	      gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
>  	  }

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2018-12-06 17:02     ` [RFC PATCH] Coalesce host to device transfers in libgomp Thomas Schwinge
@ 2018-12-06 17:19       ` Jakub Jelinek
  2018-12-06 17:54         ` Thomas Schwinge
  0 siblings, 1 reply; 19+ messages in thread
From: Jakub Jelinek @ 2018-12-06 17:19 UTC (permalink / raw)
  To: Thomas Schwinge
  Cc: gcc-patches, Martin Jambor, Alexander Monakov, Chung-Lin Tang

On Thu, Dec 06, 2018 at 06:01:48PM +0100, Thomas Schwinge wrote:
> While reviewing Chung-Lin's
> <https://gcc.gnu.org/ml/gcc-patches/2018-09/msg01428.html> "[PATCH 4/6,
> OpenACC, libgomp] Async re-work, libgomp/target.c changes", I noticed the
> following unrelated hunk.  Is that intentional or just an oversight that
> it hasn't been included in your "gomp_coalesce_buf" changes (quoted below
> for reference)?

I believe it is intentional, the coalescing code coalesces only stuff
allocated by the current gomp_map_vars call, for the link_key case we know
that is not the case, it is a copy to a file scope data variable in the PTX
code.  Perhaps we could do the change but pass NULL instead
of cbufp as the last argument?

> commit 2abec5454063076ebd0fddf6ed25a3459c4f5ac3
> Author: Thomas Schwinge <thomas@codesourcery.com>
> Date:   Thu Dec 6 17:52:34 2018 +0100
> 
>     Coalesce host to device transfers in libgomp: link pointer
>     
>             libgomp/
>             * target.c (gomp_map_vars): Call "gomp_copy_host2dev" instead of
>             "devicep->host2dev_func".
> ---
>  libgomp/target.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git libgomp/target.c libgomp/target.c
> index 8ebc2a370a16..9cb2ec8d026f 100644
> --- libgomp/target.c
> +++ libgomp/target.c
> @@ -957,9 +957,9 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
>  		    /* Set link pointer on target to the device address of the
>  		       mapped object.  */
>  		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
> -		    devicep->host2dev_func (devicep->target_id,
> -					    (void *) n->tgt_offset,
> -					    &tgt_addr, sizeof (void *));
> +		    gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
> +					&tgt_addr, sizeof (void *), cbufp);
> +
>  		  }
>  		array++;
>  	      }
> 

	Jakub

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2018-12-06 17:19       ` Jakub Jelinek
@ 2018-12-06 17:54         ` Thomas Schwinge
  2018-12-06 17:57           ` Jakub Jelinek
  0 siblings, 1 reply; 19+ messages in thread
From: Thomas Schwinge @ 2018-12-06 17:54 UTC (permalink / raw)
  To: Jakub Jelinek, gcc-patches
  Cc: Martin Jambor, Alexander Monakov, Chung-Lin Tang

Hi Jakub!

On Thu, 6 Dec 2018 18:18:56 +0100, Jakub Jelinek <jakub@redhat.com> wrote:
> On Thu, Dec 06, 2018 at 06:01:48PM +0100, Thomas Schwinge wrote:
> > While reviewing Chung-Lin's
> > <https://gcc.gnu.org/ml/gcc-patches/2018-09/msg01428.html> "[PATCH 4/6,
> > OpenACC, libgomp] Async re-work, libgomp/target.c changes", I noticed the
> > following unrelated hunk.  Is that intentional or just an oversight that
> > it hasn't been included in your "gomp_coalesce_buf" changes (quoted below
> > for reference)?
> 
> I believe it is intentional, the coalescing code coalesces only stuff
> allocated by the current gomp_map_vars call, for the link_key case we know
> that is not the case, it is a copy to a file scope data variable in the PTX
> code.

Hmm, I thought this would just copy an address (as opposed to data) from
the host to the device, so that would be fine for coalescing.  But I'm
not familiar with that code, so it's certainly possible that I'm not
understanding this correctly.

> Perhaps we could do the change but pass NULL instead
> of cbufp as the last argument?

Like this?

commit 241027a03b70c788ef94ccf258b799332fb1b20e
Author: Thomas Schwinge <thomas@codesourcery.com>
Date:   Thu Dec 6 18:53:16 2018 +0100

    Coalesce host to device transfers in libgomp: not for link pointer
    
    2018-12-06  Thomas Schwinge  <thomas@codesourcery.com>
                Jakub Jelinek  <jakub@redhat.com>
    
            libgomp/
            * target.c (gomp_map_vars): Call "gomp_copy_host2dev" instead of
            "devicep->host2dev_func".
---
 libgomp/target.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git libgomp/target.c libgomp/target.c
index 8ebc2a370a16..60f4c96f3908 100644
--- libgomp/target.c
+++ libgomp/target.c
@@ -957,9 +957,11 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 		    /* Set link pointer on target to the device address of the
 		       mapped object.  */
 		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
-		    devicep->host2dev_func (devicep->target_id,
-					    (void *) n->tgt_offset,
-					    &tgt_addr, sizeof (void *));
+		    /* We intentionally do not use coalescing here, as it's not
+		       data allocated by the current call to this function.  */
+		    gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
+					&tgt_addr, sizeof (void *), NULL);
+
 		  }
 		array++;
 	      }


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2018-12-06 17:54         ` Thomas Schwinge
@ 2018-12-06 17:57           ` Jakub Jelinek
  2018-12-09 12:53             ` Thomas Schwinge
  0 siblings, 1 reply; 19+ messages in thread
From: Jakub Jelinek @ 2018-12-06 17:57 UTC (permalink / raw)
  To: Thomas Schwinge
  Cc: gcc-patches, Martin Jambor, Alexander Monakov, Chung-Lin Tang

On Thu, Dec 06, 2018 at 06:54:20PM +0100, Thomas Schwinge wrote:
> On Thu, 6 Dec 2018 18:18:56 +0100, Jakub Jelinek <jakub@redhat.com> wrote:
> > On Thu, Dec 06, 2018 at 06:01:48PM +0100, Thomas Schwinge wrote:
> > > While reviewing Chung-Lin's
> > > <https://gcc.gnu.org/ml/gcc-patches/2018-09/msg01428.html> "[PATCH 4/6,
> > > OpenACC, libgomp] Async re-work, libgomp/target.c changes", I noticed the
> > > following unrelated hunk.  Is that intentional or just an oversight that
> > > it hasn't been included in your "gomp_coalesce_buf" changes (quoted below
> > > for reference)?
> > 
> > I believe it is intentional, the coalescing code coalesces only stuff
> > allocated by the current gomp_map_vars call, for the link_key case we know
> > that is not the case, it is a copy to a file scope data variable in the PTX
> > code.
> 
> Hmm, I thought this would just copy an address (as opposed to data) from
> the host to the device, so that would be fine for coalescing.  But I'm
> not familiar with that code, so it's certainly possible that I'm not
> understanding this correctly.

The actual data transfer can be coalesced, just the address is copied into
the offloaded file scope var and so that exact transfer can't be coalesced.
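
E.g. (made-up example, just to illustrate):

  int arr[1024];
  #pragma omp declare target link(arr)

Mapping arr copies the 1024 ints into device memory freshly allocated by
that gomp_map_vars call, so that copy can go through the coalescing
buffer; the store of arr's device address into the link pointer in the
offloaded image writes to pre-existing device storage and therefore has
to stay a separate host2dev transfer.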

> > Perhaps we could do the change but pass NULL instead
> > of cbufp as the last argument?
> 
> Like this?
> 
> commit 241027a03b70c788ef94ccf258b799332fb1b20e
> Author: Thomas Schwinge <thomas@codesourcery.com>
> Date:   Thu Dec 6 18:53:16 2018 +0100
> 
>     Coalesce host to device transfers in libgomp: not for link pointer
>     
>     2018-12-06  Thomas Schwinge  <thomas@codesourcery.com>
>                 Jakub Jelinek  <jakub@redhat.com>
>     
>             libgomp/
>             * target.c (gomp_map_vars): Call "gomp_copy_host2dev" instead of
>             "devicep->host2dev_func".

Ok for trunk, thanks.  Perhaps no need for the "s in the ChangeLog.

> ---
>  libgomp/target.c | 8 +++++---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git libgomp/target.c libgomp/target.c
> index 8ebc2a370a16..60f4c96f3908 100644
> --- libgomp/target.c
> +++ libgomp/target.c
> @@ -957,9 +957,11 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
>  		    /* Set link pointer on target to the device address of the
>  		       mapped object.  */
>  		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
> -		    devicep->host2dev_func (devicep->target_id,
> -					    (void *) n->tgt_offset,
> -					    &tgt_addr, sizeof (void *));
> +		    /* We intentionally do not use coalescing here, as it's not
> +		       data allocated by the current call to this function.  */
> +		    gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
> +					&tgt_addr, sizeof (void *), NULL);
> +
>  		  }
>  		array++;
>  	      }

	Jakub

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2018-12-06 17:57           ` Jakub Jelinek
@ 2018-12-09 12:53             ` Thomas Schwinge
  0 siblings, 0 replies; 19+ messages in thread
From: Thomas Schwinge @ 2018-12-09 12:53 UTC (permalink / raw)
  To: Jakub Jelinek, gcc-patches
  Cc: Martin Jambor, Alexander Monakov, Chung-Lin Tang

Hi!

On Thu, 6 Dec 2018 18:57:31 +0100, Jakub Jelinek <jakub@redhat.com> wrote:
> On Thu, Dec 06, 2018 at 06:54:20PM +0100, Thomas Schwinge wrote:
> > On Thu, 6 Dec 2018 18:18:56 +0100, Jakub Jelinek <jakub@redhat.com> wrote:
> > > On Thu, Dec 06, 2018 at 06:01:48PM +0100, Thomas Schwinge wrote:
> > > > While reviewing Chung-Lin's
> > > > <https://gcc.gnu.org/ml/gcc-patches/2018-09/msg01428.html> "[PATCH 4/6,
> > > > OpenACC, libgomp] Async re-work, libgomp/target.c changes", I noticed the
> > > > following unrelated hunk.  Is that intentional or just an oversight that
> > > > it hasn't been included in your "gomp_coalesce_buf" changes (quoted below
> > > > for reference)?
> > > 
> > > I believe it is intentional, the coalescing code coalesces only stuff
> > > allocated by the current gomp_map_vars call, for the link_key case we know
> > > that is not the case, it is a copy to a file scope data variable in the PTX
> > > code.
> > 
> > Hmm, I thought this would just copy an address (as opposed to data) from
> > the host to the device, so that would be fine for coalescing.  But I'm
> > not familiar with that code, so it's certainly possible that I'm not
> > understanding this correctly.
> 
> The actual data transfer can be coalesced, just the address is copied into
> the offloaded file scope var and so that exact transfer can't be coalesced.

Ah, I see, thanks!

> > > Perhaps we could do the change but pass NULL instead
> > > of cbufp as the last argument?

Committed to trunk in r266919:

commit 9d5a0b9dbb3aa4493f6e20b711607a25783bcec3
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date:   Sun Dec 9 12:47:23 2018 +0000

    Coalesce host to device transfers in libgomp: not for link pointer
    
            libgomp/
            * target.c (gomp_map_vars): Call gomp_copy_host2dev instead of
            devicep->host2dev_func.
    
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@266919 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog | 6 ++++++
 libgomp/target.c  | 7 ++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git libgomp/ChangeLog libgomp/ChangeLog
index 7ce0cdb42e14..99417ef62cf0 100644
--- libgomp/ChangeLog
+++ libgomp/ChangeLog
@@ -1,3 +1,9 @@
+2018-12-09  Thomas Schwinge  <thomas@codesourcery.com>
+	    Jakub Jelinek  <jakub@redhat.com>
+
+	* target.c (gomp_map_vars): Call gomp_copy_host2dev instead of
+	devicep->host2dev_func.
+
 2018-12-08  Jakub Jelinek  <jakub@redhat.com>
 
 	PR libgomp/87995
diff --git libgomp/target.c libgomp/target.c
index 8ebc2a370a16..a62ae2c3e4b3 100644
--- libgomp/target.c
+++ libgomp/target.c
@@ -957,9 +957,10 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 		    /* Set link pointer on target to the device address of the
 		       mapped object.  */
 		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
-		    devicep->host2dev_func (devicep->target_id,
-					    (void *) n->tgt_offset,
-					    &tgt_addr, sizeof (void *));
+		    /* We intentionally do not use coalescing here, as it's not
+		       data allocated by the current call to this function.  */
+		    gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
+					&tgt_addr, sizeof (void *), NULL);
 		  }
 		array++;
 	      }


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2017-10-24  9:57 [RFC PATCH] Coalesce host to device transfers in libgomp Jakub Jelinek
  2017-10-24 15:59 ` Cesar Philippidis
  2017-10-24 17:40 ` Alexander Monakov
@ 2019-05-23 14:40 ` Thomas Schwinge
  2019-05-23 14:57   ` Jakub Jelinek
  2 siblings, 1 reply; 19+ messages in thread
From: Thomas Schwinge @ 2019-05-23 14:40 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches, Alexander Monakov


Hi!

On Tue, 24 Oct 2017 11:55:27 +0200, Jakub Jelinek <jakub@redhat.com> wrote:
> The following patch implements coalescing of transfers (only those that are
> copied into the freshly allocated device buffer) into one or multiple larger
> transfers.  The patch doesn't coalesce > 32KB transfers or transfers where
> the gap is 4KB or more.  I guess it would be not too hard to do similar
> coalescing for the dev2host transfers that are from a single device mapping,
> though probably far less important than the more common host2dev transfers.

I too wondered about device to host copies.  (..., and in the OpenACC
context, how that would interact with 'async'...)

And then, I wondered about 'OpenMP target enter data' directives -- if
that one creates/copies multiple objects, wouldn't that likewise benefit
from the coalescing optimization?  There is the (implementation?)
problem, though, that 'GOMP_target_enter_exit_data' calls 'gomp_map_vars'
separately for each mapping -- is that just because of the special
'GOMP_MAP_STRUCT' handling?  (Could we easily do "ranges" between such
interrupters?)

And then, could we go as far as using the coalescing optimization even
for 'update'/'exit data' directives, and/or potentially for generally all
host to device and device to host copies, when we can determine that the
device addresses are adjacent to each other?  Or would figuring that out
require more effort compared to just launching individual transfers?
Just an idea that I had...
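
To sketch what such an adjacency check might look like -- purely
hypothetically, not libgomp code; names like struct pending_copy and
do_dev2host are invented -- one could walk the pending copies sorted by
device address and merge runs that are contiguous on both the device and
the host side into a single transfer:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct pending_copy
{
  uintptr_t dev_addr;
  char *host_addr;
  size_t len;
};

/* Stand-in for the real device->host copy primitive.  */
static void
do_dev2host (void *host, uintptr_t dev, size_t len)
{
  printf ("dev2host transfer: %zu bytes from 0x%lx\n", len,
          (unsigned long) dev);
  (void) host;
}

/* Assumes C[0..N-1] is sorted by device address.  */
static void
flush_merged (struct pending_copy *c, size_t n)
{
  size_t i = 0;
  while (i < n)
    {
      uintptr_t dev = c[i].dev_addr;
      char *host = c[i].host_addr;
      size_t len = c[i].len;
      /* Grow the run while the next copy starts exactly where this one
         ends, on both the device and the host side.  */
      while (i + 1 < n
             && c[i + 1].dev_addr == dev + len
             && c[i + 1].host_addr == host + len)
        {
          len += c[i + 1].len;
          i++;
        }
      do_dev2host (host, dev, len);
      i++;
    }
}

int
main (void)
{
  char buf[64];
  struct pending_copy copies[] = {
    { 0x2000, buf,      16 },
    { 0x2010, buf + 16, 16 },  /* Contiguous with the previous copy.  */
    { 0x3000, buf + 48, 16 },  /* Gap on the device side.  */
  };
  flush_merged (copies, 3);    /* Two transfers instead of three.  */
  return 0;
}

Whether doing that sorting and merging is cheaper than simply launching the
individual transfers is of course exactly the open question above.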


Grüße
 Thomas



* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2019-05-23 14:40 ` Thomas Schwinge
@ 2019-05-23 14:57   ` Jakub Jelinek
  0 siblings, 0 replies; 19+ messages in thread
From: Jakub Jelinek @ 2019-05-23 14:57 UTC (permalink / raw)
  To: Thomas Schwinge; +Cc: gcc-patches, Alexander Monakov

On Thu, May 23, 2019 at 04:40:14PM +0200, Thomas Schwinge wrote:
> Hi!
> 
> On Tue, 24 Oct 2017 11:55:27 +0200, Jakub Jelinek <jakub@redhat.com> wrote:
> > The following patch implements coalescing of transfers (only those that are
> > copied into the freshly allocated device buffer) into one or multiple larger
> > transfers.  The patch doesn't coalesce > 32KB transfers or transfers where
> > the gap is 4KB or more.  I guess it would be not too hard to do similar
> > coalescing for the dev2host transfers that are from a single device mapping,
> > though probably far less important than the more common host2dev transfers.
> 
> I too wondered about device to host copies.  (..., and in the OpenACC
> context, how that would interact with 'async'...)
> 
> And then, I wondered about 'OpenMP target enter data' directives -- if
> that one creates/copies multiple objects, wouldn't that likewise benefit
> from the coalescing optimization?  There is the (implementation?)
> problem, though, that 'GOMP_target_enter_exit_data' calls 'gomp_map_vars'
> separately for each mapping -- is that just because of the special
> 'GOMP_MAP_STRUCT' handling?  (Could we easily do "ranges" between such
> interrupters?)

No.  The reason for that is that while for target data and target we can
estimate the lifetime of the device data (sure, one can do weird things
there too), with enter data / exit data it is far less likely that all the
data mapped by the same enter data will also be deallocated together by a
single exit data.  If one allocates all the data together on enter data and
then frees all but one of them, the whole block will need to stay allocated
on the device until even that last one is freed.
For the target construct, the only way to get something similar would be if
some other thread performed target enter data while the target task is
running, which would be quite racy.  For target data, one can add target
enter data inside of target data, but I'd say that is quite unusual.
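
To make that concrete, a small hypothetical OpenMP example (not libgomp
code; the names and mappings are made up): if the enter data below
coalesced a, b and c into one device block, that whole block would have to
stay allocated until the last exit data, even though b and c are released
much earlier.

#include <stdlib.h>

void
example (size_t n)
{
  double *a = malloc (n * sizeof *a);
  double *b = malloc (n * sizeof *b);
  double *c = malloc (n * sizeof *c);

  #pragma omp target enter data map(to: a[:n], b[:n], c[:n])

  /* ... target regions using a, b and c ... */

  #pragma omp target exit data map(delete: b[:n], c[:n])

  /* b and c are gone from the device, but if all three had been placed
     in one coalesced allocation, that allocation could not be freed here
     because a is still mapped.  */

  /* ... much later ... */
  #pragma omp target exit data map(from: a[:n])

  free (a);
  free (b);
  free (c);
}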

	Jakub


* Re: [RFC PATCH] Coalesce host to device transfers in libgomp
  2017-10-25 12:03   ` Jakub Jelinek
  2017-10-27 14:13     ` [PATCH] Implement omp async support for nvptx Tom de Vries
  2018-12-06 17:02     ` [RFC PATCH] Coalesce host to device transfers in libgomp Thomas Schwinge
@ 2019-12-18 17:15     ` Thomas Schwinge
  2 siblings, 0 replies; 19+ messages in thread
From: Thomas Schwinge @ 2019-12-18 17:15 UTC (permalink / raw)
  To: Jakub Jelinek, gcc-patches; +Cc: Martin Jambor, Alexander Monakov



Hi!

On 2017-10-25T13:38:50+0200, Jakub Jelinek <jakub@redhat.com> wrote:
> --- libgomp/target.c.jj	2017-10-24 12:07:03.763759657 +0200
> +++ libgomp/target.c	2017-10-25 13:17:31.608975390 +0200

> +/* Return true for mapping kinds which need to copy data from the
> +   host to device for regions that weren't previously mapped.  */
> +
> +static inline bool
> +gomp_to_device_kind_p (int kind)
> +{
> +  switch (kind)
> +    {
> +    case GOMP_MAP_ALLOC:
> +    case GOMP_MAP_FROM:
> +    case GOMP_MAP_FORCE_ALLOC:
> +    case GOMP_MAP_ALWAYS_FROM:
> +      return false;
> +    default:
> +      return true;
> +    }
> +}

Poor 'GOMP_MAP_FORCE_FROM'...  ;'-|

See attached "[OpenACC] In 'libgomp/target.c:gomp_to_device_kind_p',
handle 'GOMP_MAP_FORCE_FROM' like 'GOMP_MAP_FROM'"; committed to trunk in
r279533.


Grüße
 Thomas



[-- Attachment #1.2: 0001-OpenACC-In-libgomp-target.c-gomp_to_device_kin.trunk.patch --]

From 74bb6382e2be4c478e2f58daa3cdf1c42b6c2480 Mon Sep 17 00:00:00 2001
From: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Wed, 18 Dec 2019 17:01:22 +0000
Subject: [PATCH] [OpenACC] In 'libgomp/target.c:gomp_to_device_kind_p', handle
 'GOMP_MAP_FORCE_FROM' like 'GOMP_MAP_FROM'

Fix oversight from r254194 "Coalesce host to device transfers in libgomp".

	libgomp/
	* target.c (gomp_to_device_kind_p): Handle 'GOMP_MAP_FORCE_FROM'
	like 'GOMP_MAP_FROM'.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@279533 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog | 3 +++
 libgomp/target.c  | 1 +
 2 files changed, 4 insertions(+)

diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 871a1537c77..472519c7e3e 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,5 +1,8 @@
 2019-12-18  Thomas Schwinge  <thomas@codesourcery.com>
 
+	* target.c (gomp_to_device_kind_p): Handle 'GOMP_MAP_FORCE_FROM'
+	like 'GOMP_MAP_FROM'.
+
 	PR libgomp/92726
 	PR libgomp/92970
 	PR libgomp/92984
diff --git a/libgomp/target.c b/libgomp/target.c
index 41cf6a3d7d2..a3cdb34bd51 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -279,6 +279,7 @@ gomp_to_device_kind_p (int kind)
     case GOMP_MAP_ALLOC:
     case GOMP_MAP_FROM:
     case GOMP_MAP_FORCE_ALLOC:
+    case GOMP_MAP_FORCE_FROM:
     case GOMP_MAP_ALWAYS_FROM:
       return false;
     default:
-- 
2.17.1





Thread overview: 19+ messages
2017-10-24  9:57 [RFC PATCH] Coalesce host to device transfers in libgomp Jakub Jelinek
2017-10-24 15:59 ` Cesar Philippidis
2017-10-24 16:02   ` Jakub Jelinek
2017-10-24 17:56     ` Alexander Monakov
2017-10-24 17:40 ` Alexander Monakov
2017-10-24 19:36   ` Jakub Jelinek
2017-10-25 12:03   ` Jakub Jelinek
2017-10-27 14:13     ` [PATCH] Implement omp async support for nvptx Tom de Vries
2017-10-30  7:25       ` Jakub Jelinek
2017-10-30 12:02         ` Tom de Vries
2017-10-30 13:52           ` Tom de Vries
2018-12-06 17:02     ` [RFC PATCH] Coalesce host to device transfers in libgomp Thomas Schwinge
2018-12-06 17:19       ` Jakub Jelinek
2018-12-06 17:54         ` Thomas Schwinge
2018-12-06 17:57           ` Jakub Jelinek
2018-12-09 12:53             ` Thomas Schwinge
2019-12-18 17:15     ` Thomas Schwinge
2019-05-23 14:40 ` Thomas Schwinge
2019-05-23 14:57   ` Jakub Jelinek
