public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [RFC PATCH] Coalesce host to device transfers in libgomp
@ 2017-10-24  9:57 Jakub Jelinek
  2017-10-24 15:59 ` Cesar Philippidis
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Jakub Jelinek @ 2017-10-24  9:57 UTC (permalink / raw)
  To: Alexander Monakov, Thomas Schwinge, Martin Jambor, Cesar Philippidis
  Cc: gcc-patches

Hi!

People from NVIDIA privately reported an unexpected amount of host2dev
transfers for #pragma omp target*.
The code even had comments like:
                   /* FIXME: Perhaps add some smarts, like if copying
                      several adjacent fields from host to target, use some
                      host buffer to avoid sending each var individually.  */
and the especially bad example of this was the
      for (i = 0; i < mapnum; i++)
        {
          cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
          /* FIXME: see above FIXME comment.  */
          gomp_copy_host2dev (devicep,
                              (void *) (tgt->tgt_start + i * sizeof (void *)),
                              (void *) &cur_node.tgt_offset, sizeof (void *));
        }
loop transferring the addresses or firstprivate_int values to the device
- where we issued mapnum host2dev transfers, each just pointer-sized,
when we could have just prepared all the pointers in an array and host2dev
copied them all together.

The following patch implements coalescing of transfers (only those that are
copied into the freshly allocated device buffer) into one or multiple larger
transfers.  The patch doesn't coalesce > 32KB transfers or transfers where
the gap is 4KB or more.  I guess it would not be too hard to do similar
coalescing for the dev2host transfers that are from a single device mapping,
though that is probably far less important than the more common host2dev transfers.

Tested on x86_64-linux to nvptx-none offloading.

Thoughts on this?

2017-10-24  Jakub Jelinek  <jakub@redhat.com>

	* target.c (struct gomp_map_cache): New type.
	(gomp_cache_add): New function.
	(gomp_copy_host2dev): Add CACHE argument, if copying into
	the cached ranges, memcpy into buffer instead of copying
	into device.
	(gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
	Add CACHE argument, pass it through to other calls.
	(gomp_map_vars): Aggregate copies from host to device if small enough
	and with small enough gaps in between into memcpy into a buffer and
	fewer host to device copies from the buffer.
	(gomp_update): Adjust gomp_copy_host2dev caller.

--- libgomp/target.c.jj	2017-04-20 14:59:08.296263304 +0200
+++ libgomp/target.c	2017-10-23 19:08:14.348336118 +0200
@@ -177,10 +177,77 @@ gomp_device_copy (struct gomp_device_des
     }
 }
 
+struct gomp_map_cache
+{
+  void *buf;
+  struct target_mem_desc *tgt;
+  size_t *chunks;
+  long chunk_cnt;
+  long use_cnt;
+};
+
+static inline void
+gomp_cache_add (struct gomp_map_cache *cache, size_t start, size_t len)
+{
+  if (len > 32 * 1024 || len == 0)
+    return;
+  if (cache->chunk_cnt)
+    {
+      if (cache->chunk_cnt < 0)
+	return;
+      if (start < cache->chunks[2 * cache->chunk_cnt - 1])
+	{
+	  cache->chunk_cnt = -1;
+	  return;
+	}
+      if (start < cache->chunks[2 * cache->chunk_cnt - 1] + 4 * 1024)
+	{
+	  cache->chunks[2 * cache->chunk_cnt - 1] = start + len;
+	  cache->use_cnt++;
+	  return;
+	}
+      /* If the last chunk is only used by one mapping, discard it,
+	 as it will be one host to device copy anyway and
+	 memcpying it around will only waste cycles.  */
+      if (cache->use_cnt == 1)
+	cache->chunk_cnt--;
+    }
+  cache->chunks[2 * cache->chunk_cnt] = start;
+  cache->chunks[2 * cache->chunk_cnt + 1] = start + len;
+  cache->chunk_cnt++;
+  cache->use_cnt = 1;
+}
+
 static void
 gomp_copy_host2dev (struct gomp_device_descr *devicep,
-		    void *d, const void *h, size_t sz)
+		    void *d, const void *h, size_t sz,
+		    struct gomp_map_cache *cache)
 {
+  if (cache)
+    {
+      uintptr_t doff = (uintptr_t) d - cache->tgt->tgt_start;
+      if (doff < cache->chunks[2 * cache->chunk_cnt - 1])
+	{
+	  long first = 0;
+	  long last = cache->chunk_cnt - 1;
+	  while (first <= last)
+	    {
+	      long middle = (first + last) >> 1;
+	      if (cache->chunks[2 * middle + 1] <= doff)
+		first = middle + 1;
+	      else if (cache->chunks[2 * middle] <= doff)
+		{
+		  if (doff + sz > cache->chunks[2 * middle + 1])
+		    gomp_fatal ("internal libgomp cache error");
+		  memcpy ((char *) cache->buf + (doff - cache->chunks[0]),
+			  h, sz);
+		  return;
+		}
+	      else
+		last = middle - 1;
+	    }
+	}
+    }
   gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
 }
 
@@ -208,7 +275,7 @@ gomp_free_device_memory (struct gomp_dev
 static inline void
 gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
 			splay_tree_key newn, struct target_var_desc *tgt_var,
-			unsigned char kind)
+			unsigned char kind, struct gomp_map_cache *cache)
 {
   tgt_var->key = oldn;
   tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
@@ -232,7 +299,7 @@ gomp_map_vars_existing (struct gomp_devi
 			(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
 				  + newn->host_start - oldn->host_start),
 			(void *) newn->host_start,
-			newn->host_end - newn->host_start);
+			newn->host_end - newn->host_start, cache);
 
   if (oldn->refcount != REFCOUNT_INFINITY)
     oldn->refcount++;
@@ -247,7 +314,8 @@ get_kind (bool short_mapkind, void *kind
 
 static void
 gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
-		  uintptr_t target_offset, uintptr_t bias)
+		  uintptr_t target_offset, uintptr_t bias,
+		  struct gomp_map_cache *cache)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
   struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -257,11 +325,10 @@ gomp_map_pointer (struct target_mem_desc
   if (cur_node.host_start == (uintptr_t) NULL)
     {
       cur_node.tgt_offset = (uintptr_t) NULL;
-      /* FIXME: see comment about coalescing host/dev transfers below.  */
       gomp_copy_host2dev (devicep,
 			  (void *) (tgt->tgt_start + target_offset),
 			  (void *) &cur_node.tgt_offset,
-			  sizeof (void *));
+			  sizeof (void *), cache);
       return;
     }
   /* Add bias to the pointer value.  */
@@ -280,15 +347,15 @@ gomp_map_pointer (struct target_mem_desc
      array section.  Now subtract bias to get what we want
      to initialize the pointer with.  */
   cur_node.tgt_offset -= bias;
-  /* FIXME: see comment about coalescing host/dev transfers below.  */
   gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
-		      (void *) &cur_node.tgt_offset, sizeof (void *));
+		      (void *) &cur_node.tgt_offset, sizeof (void *), cache);
 }
 
 static void
 gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
 			  size_t first, size_t i, void **hostaddrs,
-			  size_t *sizes, void *kinds)
+			  size_t *sizes, void *kinds,
+			  struct gomp_map_cache *cache)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
   struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -306,7 +373,7 @@ gomp_map_fields_existing (struct target_
       && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
     {
       gomp_map_vars_existing (devicep, n2, &cur_node,
-			      &tgt->list[i], kind & typemask);
+			      &tgt->list[i], kind & typemask, cache);
       return;
     }
   if (sizes[i] == 0)
@@ -322,7 +389,7 @@ gomp_map_fields_existing (struct target_
 		 == n2->tgt_offset - n->tgt_offset)
 	    {
 	      gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
-				      kind & typemask);
+				      kind & typemask, cache);
 	      return;
 	    }
 	}
@@ -334,7 +401,7 @@ gomp_map_fields_existing (struct target_
 	  && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
 	{
 	  gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
-				  kind & typemask);
+				  kind & typemask, cache);
 	  return;
 	}
     }
@@ -381,6 +448,7 @@ gomp_map_vars (struct gomp_device_descr
   tgt->list_count = mapnum;
   tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
   tgt->device_descr = devicep;
+  struct gomp_map_cache cache, *cachep = NULL;
 
   if (mapnum == 0)
     {
@@ -391,11 +459,25 @@ gomp_map_vars (struct gomp_device_descr
 
   tgt_align = sizeof (void *);
   tgt_size = 0;
+  cache.chunks = NULL;
+  cache.chunk_cnt = -1;
+  cache.use_cnt = 0;
+  cache.buf = NULL;
+  if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
+    {
+      cache.chunks
+	= (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
+      cache.chunk_cnt = 0;
+    }
   if (pragma_kind == GOMP_MAP_VARS_TARGET)
     {
       size_t align = 4 * sizeof (void *);
       tgt_align = align;
       tgt_size = mapnum * sizeof (void *);
+      cache.chunk_cnt = 1;
+      cache.use_cnt = 1 + (mapnum > 1);
+      cache.chunks[0] = 0;
+      cache.chunks[1] = tgt_size;
     }
 
   gomp_mutex_lock (&devicep->lock);
@@ -449,19 +531,34 @@ gomp_map_vars (struct gomp_device_descr
 	      size_t align = (size_t) 1 << (kind >> rshift);
 	      if (tgt_align < align)
 		tgt_align = align;
-	      tgt_size -= (uintptr_t) hostaddrs[first]
-			  - (uintptr_t) hostaddrs[i];
+	      tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
 	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	      tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
+	      tgt_size += cur_node.host_end - cur_node.host_start;
 	      not_found_cnt += last - i;
 	      for (i = first; i <= last; i++)
-		tgt->list[i].key = NULL;
+		{
+		  tgt->list[i].key = NULL;
+		  switch (get_kind (short_mapkind, kinds, i) & typemask)
+		    {
+		    case GOMP_MAP_ALLOC:
+		    case GOMP_MAP_FROM:
+		    case GOMP_MAP_FORCE_ALLOC:
+		    case GOMP_MAP_ALWAYS_FROM:
+		      break;
+		    default:
+		      /* All the others copy data if newly allocated.  */
+		      gomp_cache_add (&cache, tgt_size - cur_node.host_end
+					      + (uintptr_t) hostaddrs[i],
+				      sizes[i]);
+		      break;
+		    }
+		}
 	      i--;
 	      continue;
 	    }
 	  for (i = first; i <= last; i++)
 	    gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
-				      sizes, kinds);
+				      sizes, kinds, NULL);
 	  i--;
 	  continue;
 	}
@@ -485,6 +582,8 @@ gomp_map_vars (struct gomp_device_descr
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  gomp_cache_add (&cache, tgt_size,
+			  cur_node.host_end - cur_node.host_start);
 	  tgt_size += cur_node.host_end - cur_node.host_start;
 	  has_firstprivate = true;
 	  continue;
@@ -504,7 +603,7 @@ gomp_map_vars (struct gomp_device_descr
 	n = splay_tree_lookup (mem_map, &cur_node);
       if (n && n->refcount != REFCOUNT_LINK)
 	gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
-				kind & typemask);
+				kind & typemask, NULL);
       else
 	{
 	  tgt->list[i].key = NULL;
@@ -514,6 +613,19 @@ gomp_map_vars (struct gomp_device_descr
 	  if (tgt_align < align)
 	    tgt_align = align;
 	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  switch (kind & typemask)
+	    {
+	    case GOMP_MAP_ALLOC:
+	    case GOMP_MAP_FROM:
+	    case GOMP_MAP_FORCE_ALLOC:
+	    case GOMP_MAP_ALWAYS_FROM:
+	      break;
+	    default:
+	      /* All the others copy data if newly allocated.  */
+	      gomp_cache_add (&cache, tgt_size,
+			      cur_node.host_end - cur_node.host_start);
+	      break;
+	    }
 	  tgt_size += cur_node.host_end - cur_node.host_start;
 	  if ((kind & typemask) == GOMP_MAP_TO_PSET)
 	    {
@@ -562,6 +674,19 @@ gomp_map_vars (struct gomp_device_descr
       tgt->tgt_start = (uintptr_t) tgt->to_free;
       tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1);
       tgt->tgt_end = tgt->tgt_start + tgt_size;
+
+      if (cache.use_cnt == 1)
+	cache.chunk_cnt--;
+      if (cache.chunk_cnt > 0)
+	{
+	  cache.buf
+	    = malloc (cache.chunks[2 * cache.chunk_cnt - 1] - cache.chunks[0]);
+	  if (cache.buf)
+	    {
+	      cache.tgt = tgt;
+	      cachep = &cache;
+	    }
+	}
     }
   else
     {
@@ -600,7 +725,7 @@ gomp_map_vars (struct gomp_device_descr
 		len = sizes[i];
 		gomp_copy_host2dev (devicep,
 				    (void *) (tgt->tgt_start + tgt_size),
-				    (void *) hostaddrs[i], len);
+				    (void *) hostaddrs[i], len, cachep);
 		tgt_size += len;
 		continue;
 	      case GOMP_MAP_FIRSTPRIVATE_INT:
@@ -633,7 +758,7 @@ gomp_map_vars (struct gomp_device_descr
 		  }
 		for (i = first; i <= last; i++)
 		  gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
-					    sizes, kinds);
+					    sizes, kinds, cachep);
 		i--;
 		continue;
 	      case GOMP_MAP_ALWAYS_POINTER:
@@ -658,7 +783,7 @@ gomp_map_vars (struct gomp_device_descr
 					      + cur_node.host_start
 					      - n->host_start),
 				    (void *) &cur_node.tgt_offset,
-				    sizeof (void *));
+				    sizeof (void *), cachep);
 		cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
 				      + cur_node.host_start - n->host_start;
 		continue;
@@ -674,7 +799,7 @@ gomp_map_vars (struct gomp_device_descr
 	    splay_tree_key n = splay_tree_lookup (mem_map, k);
 	    if (n && n->refcount != REFCOUNT_LINK)
 	      gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
-				      kind & typemask);
+				      kind & typemask, cachep);
 	    else
 	      {
 		k->link_key = NULL;
@@ -725,26 +850,22 @@ gomp_map_vars (struct gomp_device_descr
 		  case GOMP_MAP_FORCE_TOFROM:
 		  case GOMP_MAP_ALWAYS_TO:
 		  case GOMP_MAP_ALWAYS_TOFROM:
-		    /* FIXME: Perhaps add some smarts, like if copying
-		       several adjacent fields from host to target, use some
-		       host buffer to avoid sending each var individually.  */
 		    gomp_copy_host2dev (devicep,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start);
+					k->host_end - k->host_start, cachep);
 		    break;
 		  case GOMP_MAP_POINTER:
 		    gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
-				      k->tgt_offset, sizes[i]);
+				      k->tgt_offset, sizes[i], cachep);
 		    break;
 		  case GOMP_MAP_TO_PSET:
-		    /* FIXME: see above FIXME comment.  */
 		    gomp_copy_host2dev (devicep,
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					k->host_end - k->host_start);
+					k->host_end - k->host_start, cachep);
 
 		    for (j = i + 1; j < mapnum; j++)
 		      if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
@@ -767,7 +888,7 @@ gomp_map_vars (struct gomp_device_descr
 					    k->tgt_offset
 					    + ((uintptr_t) hostaddrs[j]
 					       - k->host_start),
-					    sizes[j]);
+					    sizes[j], cachep);
 			  i++;
 			}
 		    break;
@@ -795,7 +916,7 @@ gomp_map_vars (struct gomp_device_descr
 					(void *) (tgt->tgt_start
 						  + k->tgt_offset),
 					(void *) k->host_start,
-					sizeof (void *));
+					sizeof (void *), cachep);
 		    break;
 		  default:
 		    gomp_mutex_unlock (&devicep->lock);
@@ -822,13 +943,23 @@ gomp_map_vars (struct gomp_device_descr
       for (i = 0; i < mapnum; i++)
 	{
 	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
-	  /* FIXME: see above FIXME comment.  */
 	  gomp_copy_host2dev (devicep,
 			      (void *) (tgt->tgt_start + i * sizeof (void *)),
-			      (void *) &cur_node.tgt_offset, sizeof (void *));
+			      (void *) &cur_node.tgt_offset, sizeof (void *),
+			      cachep);
 	}
     }
 
+  if (cachep)
+    {
+      long c = 0;
+      for (c = 0; c < cache.chunk_cnt; ++c)
+	gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cache.chunks[2 * c]),
+			    (char *) cache.buf + (cache.chunks[2 * c] - cache.chunks[0]),
+			    cache.chunks[2 * c + 1] - cache.chunks[2 * c], NULL);
+      free (cache.buf);
+    }
+
   /* If the variable from "omp target enter data" map-list was already mapped,
      tgt is not needed.  Otherwise tgt will be freed by gomp_unmap_vars or
      gomp_exit_data.  */
@@ -970,7 +1101,7 @@ gomp_update (struct gomp_device_descr *d
 	    size_t size = cur_node.host_end - cur_node.host_start;
 
 	    if (GOMP_MAP_COPY_TO_P (kind & typemask))
-	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size);
+	      gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
 	    if (GOMP_MAP_COPY_FROM_P (kind & typemask))
 	      gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
 	  }

	Jakub

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2019-12-18 17:13 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-24  9:57 [RFC PATCH] Coalesce host to device transfers in libgomp Jakub Jelinek
2017-10-24 15:59 ` Cesar Philippidis
2017-10-24 16:02   ` Jakub Jelinek
2017-10-24 17:56     ` Alexander Monakov
2017-10-24 17:40 ` Alexander Monakov
2017-10-24 19:36   ` Jakub Jelinek
2017-10-25 12:03   ` Jakub Jelinek
2017-10-27 14:13     ` [PATCH] Implement omp async support for nvptx Tom de Vries
2017-10-30  7:25       ` Jakub Jelinek
2017-10-30 12:02         ` Tom de Vries
2017-10-30 13:52           ` Tom de Vries
2018-12-06 17:02     ` [RFC PATCH] Coalesce host to device transfers in libgomp Thomas Schwinge
2018-12-06 17:19       ` Jakub Jelinek
2018-12-06 17:54         ` Thomas Schwinge
2018-12-06 17:57           ` Jakub Jelinek
2018-12-09 12:53             ` Thomas Schwinge
2019-12-18 17:15     ` Thomas Schwinge
2019-05-23 14:40 ` Thomas Schwinge
2019-05-23 14:57   ` Jakub Jelinek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).