* [gcc/devel/omp/gcc-12] amdgcn, libgomp: custom USM allocator
@ 2023-01-11 17:50 Andrew Stubbs
From: Andrew Stubbs @ 2023-01-11 17:50 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:2d6fb9681337077a3b639f94ed725245f17bc596

commit 2d6fb9681337077a3b639f94ed725245f17bc596
Author: Andrew Stubbs <ams@codesourcery.com>
Date:   Tue Dec 13 23:31:21 2022 +0000

    amdgcn, libgomp: custom USM allocator
    
    There were problems when critical driver data shared pages with USM data,
    so this new allocator implementation moves USM allocations onto entirely
    separate pages.
    
    libgomp/ChangeLog:
    
            * plugin/plugin-gcn.c: Include sys/mman.h and unistd.h.
            (usm_heap_create): New function.
            (struct usm_splay_tree_key_s): Delete.
            (usm_splay_compare): Delete function.
            (splay_tree_prefix): Delete define.
            (GOMP_OFFLOAD_usm_alloc): Use new allocator.
            (GOMP_OFFLOAD_usm_free): Likewise.
            (GOMP_OFFLOAD_is_usm_ptr): Likewise.
            (gomp_fatal): Delete macro.
            (splay_tree_c): Delete.
            * usm-allocator.c: New file.

Diff:
---
 libgomp/plugin/plugin-gcn.c | 205 ++++++++++++++++++++++-----------------
 libgomp/usm-allocator.c     | 231 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 347 insertions(+), 89 deletions(-)
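
(Note, not part of the commit: the central operation in this patch is marking
mmap'd pages as coarse-grained SVM.  A condensed sketch of that sequence is
below, calling the ROCm HSA extension API directly rather than through the
plugin's hsa_fns function-pointer table; the header paths and the direct call
are assumptions about a stock ROCm install, and error handling is reduced to
the minimum.)

#include <sys/mman.h>
#include <hsa/hsa.h>           /* assumed ROCm header locations */
#include <hsa/hsa_ext_amd.h>

/* Map an anonymous region and mark it coarse-grained so it can migrate
   between CPU and GPU on demand.  Returns NULL on failure.  */
static void *
make_coarse_grained_heap (size_t size)
{
  void *pages = mmap (NULL, size, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (pages == MAP_FAILED)
    return NULL;

  hsa_amd_svm_attribute_pair_t attr = {
    HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
    HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
  };
  if (hsa_amd_svm_attributes_set (pages, size, &attr, 1) != HSA_STATUS_SUCCESS)
    {
      munmap (pages, size);
      return NULL;
    }
  return pages;
}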

diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 3c0404c09b2..36fab3951d5 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -48,6 +48,8 @@
 #include "oacc-plugin.h"
 #include "oacc-int.h"
 #include <assert.h>
+#include <sys/mman.h>
+#include <unistd.h>
 
 /* These probably won't be in elf.h for a while.  */
 #ifndef R_AMDGPU_NONE
@@ -3071,6 +3073,102 @@ wait_queue (struct goacc_asyncqueue *aq)
 }
 
 /* }}}  */
+/* {{{ Unified Shared Memory
+
+   Normal heap memory is already enabled for USM, but by default it is "fine-
+   grained" memory, meaning that the GPU must access it via the system bus,
+   slowly.  Changing the page to "coarse-grained" mode means that the page
+   is migrated on-demand and can therefore be accessed quickly by both CPU and
+   GPU (although care should be taken to prevent thrashing the page back and
+   forth).
+
+   GOMP_OFFLOAD_alloc also allocates coarse-grained memory, but in that case
+   the initial location is GPU memory; GOMP_OFFLOAD_usm_alloc returns system
+   memory configured coarse-grained.
+
+   The USM memory space is allocated as a largish block and then subdivided
+   via a custom allocator.  (It would be possible to reconfigure regular
+   "malloc'd" memory, but if it ends up on the same page as memory used by
+   the HSA driver then bad things happen.)  */
+
+#include "../usm-allocator.c"
+
+/* Record a list of the memory blocks configured for USM.  */
+static struct usm_heap_pages {
+  void *start;
+  void *end;
+  struct usm_heap_pages *next;
+} *usm_heap_pages = NULL;
+
+/* Initialize or extend the USM memory space.  This is called whenever
+   allocation fails.  SIZE is the minimum size required for the failed
+   allocation to succeed; the function may choose a larger size.
+   Note that Linux lazy allocation means that the memory returned isn't
+   guaranteed to actually exist.  */
+
+static bool
+usm_heap_create (size_t size)
+{
+  static int lock = 0;
+  while (__atomic_exchange_n (&lock, 1, MEMMODEL_ACQUIRE) != 0)
+    ;
+
+  size_t default_size = 1L * 1024 * 1024 * 1024; /* 1GB */
+  if (size < default_size)
+    size = default_size;
+
+  /* Round up to a whole page.  */
+  int pagesize = getpagesize ();
+  int misalignment = size % pagesize;
+  if (misalignment > 0)
+    size += pagesize - misalignment;
+
+  /* Try to get contiguous memory, but it might not be possible.
+     The most recent previous allocation is at the head of the list.  */
+  void *addrhint = (usm_heap_pages ? usm_heap_pages->end : NULL);
+  void *new_pages = mmap (addrhint, size, PROT_READ | PROT_WRITE,
+			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (new_pages == MAP_FAILED)
+    {
+      GCN_DEBUG ("Could not allocate Unified Shared Memory heap.");
+      __atomic_store_n (&lock, 0, MEMMODEL_RELEASE);
+      return false;
+    }
+
+  /* Register the heap allocation as coarse grained, which implies USM.  */
+  struct hsa_amd_svm_attribute_pair_s attr = {
+    HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
+    HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
+  };
+  hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size,
+							       &attr, 1);
+  if (status != HSA_STATUS_SUCCESS)
+    GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
+		       " please update your drivers and/or kernel");
+
+  if (new_pages == addrhint)
+    {
+      /* We got contiguous pages, so we don't need extra list entries.  */
+      usm_heap_pages->end += size;
+    }
+  else
+    {
+      /* We need a new list entry to record a discontiguous range.  */
+      struct usm_heap_pages *page = malloc (sizeof (*page));
+      page->start = new_pages;
+      page->end = new_pages + size;
+      page->next = usm_heap_pages;
+      usm_heap_pages = page;
+    }
+
+  /* Add the new memory into the allocator's free table.  */
+  usm_register_memory (new_pages, size);
+
+  __atomic_store_n (&lock, 0, MEMMODEL_RELEASE);
+  return true;
+}
+
+/* }}} */
 /* {{{ OpenACC support  */
 
 /* Execute an OpenACC kernel, synchronously or asynchronously.  */
@@ -3905,74 +4003,24 @@ GOMP_OFFLOAD_evaluate_device (int device_num, const char *kind,
   return !isa || isa_code (isa) == agent->device_isa;
 }
 
-/* Use a splay tree to track USM allocations.  */
 
-typedef struct usm_splay_tree_node_s *usm_splay_tree_node;
-typedef struct usm_splay_tree_s *usm_splay_tree;
-typedef struct usm_splay_tree_key_s *usm_splay_tree_key;
-
-struct usm_splay_tree_key_s {
-  void *addr;
-  size_t size;
-};
-
-static inline int
-usm_splay_compare (usm_splay_tree_key x, usm_splay_tree_key y)
-{
-  if ((x->addr <= y->addr && x->addr + x->size > y->addr)
-      || (y->addr <= x->addr && y->addr + y->size > x->addr))
-    return 0;
-
-  return (x->addr > y->addr ? 1 : -1);
-}
-
-#define splay_tree_prefix usm
-#include "../splay-tree.h"
-
-static struct usm_splay_tree_s usm_map = { NULL };
-
-/* Allocate memory suitable for Unified Shared Memory.
-
-   Normal heap memory is already enabled for USM, but by default it is "fine-
-   grained" memory, meaning that the GPU must access it via the system bus,
-   slowly.  Changing the page to "coarse-grained" mode means that the page
-   is migrated on-demand and can therefore be accessed quickly by both CPU and
-   GPU (although care should be taken to prevent thrashing the page back and
-   forth).
-
-   GOMP_OFFLOAD_alloc also allocates coarse-grained memory, but in that case
-   the initial location is GPU memory; this function returns system memory.
-
-   We record and track allocations so that GOMP_OFFLOAD_is_usm_ptr can look
-   them up.  */
+/* Allocate memory suitable for Unified Shared Memory.  */
 
 void *
 GOMP_OFFLOAD_usm_alloc (int device, size_t size)
 {
-  void *ptr = malloc (size);
-  if (!ptr || !hsa_fns.hsa_amd_svm_attributes_set_fn)
-    return ptr;
-
-  /* Register the heap allocation as coarse grained, which implies USM.  */
-  struct hsa_amd_svm_attribute_pair_s attr = {
-    HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
-    HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
-  };
-  hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (ptr, size,
-							       &attr, 1);
-  if (status != HSA_STATUS_SUCCESS)
-    GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
-		       " please update your drivers and/or kernel");
-
-  /* Record the allocation for GOMP_OFFLOAD_is_usm_ptr.  */
-  usm_splay_tree_node node = malloc (sizeof (struct usm_splay_tree_node_s));
-  node->key.addr = ptr;
-  node->key.size = size;
-  node->left = NULL;
-  node->right = NULL;
-  usm_splay_tree_insert (&usm_map, node);
-
-  return ptr;
+  while (1)
+    {
+      void *result = usm_alloc (size);
+      if (result)
+	return result;
+
+      /* Allocation failed.  Try again if we can create a new heap block.
+	 Note: it's possible another thread could get to the new memory
+	 first, so the while loop is necessary. */
+      if (!usm_heap_create (size))
+	return NULL;
+    }
 }
 
 /* Free memory allocated via GOMP_OFFLOAD_usm_alloc.  */
@@ -3980,15 +4028,7 @@ GOMP_OFFLOAD_usm_alloc (int device, size_t size)
 bool
 GOMP_OFFLOAD_usm_free (int device, void *ptr)
 {
-  struct usm_splay_tree_key_s key = { ptr, 1 };
-  usm_splay_tree_key node = usm_splay_tree_lookup (&usm_map, &key);
-  if (node)
-    {
-      usm_splay_tree_remove (&usm_map, &key);
-      free (node);
-    }
-
-  free (ptr);
+  usm_free (ptr);
   return true;
 }
 
@@ -3997,8 +4037,10 @@ GOMP_OFFLOAD_usm_free (int device, void *ptr)
 bool
 GOMP_OFFLOAD_is_usm_ptr (void *ptr)
 {
-  struct usm_splay_tree_key_s key = { ptr, 1 };
-  return usm_splay_tree_lookup (&usm_map, &key);
+  for (struct usm_heap_pages *heap = usm_heap_pages; heap; heap = heap->next)
+    if (ptr >= heap->start && ptr < heap->end)
+      return true;
+  return false;
 }
 
 /* }}} */
@@ -4292,18 +4334,3 @@ GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
 }
 
 /* }}} */
-/* {{{ USM splay tree */
-
-/* Include this now so that splay-tree.c doesn't include it later.  This
-   avoids a conflict with splay_tree_prefix.  */
-#include "libgomp.h"
-
-/* This allows splay-tree.c to call gomp_fatal in this context.  The splay
-   tree code doesn't use the variadic arguments right now.  */
-#define gomp_fatal(MSG, ...) GOMP_PLUGIN_fatal (MSG)
-
-/* Include the splay tree code inline, with the prefixes added.  */
-#define splay_tree_prefix usm
-#define splay_tree_c
-#include "../splay-tree.h"
-/* }}}  */
diff --git a/libgomp/usm-allocator.c b/libgomp/usm-allocator.c
new file mode 100644
index 00000000000..c45109169ca
--- /dev/null
+++ b/libgomp/usm-allocator.c
@@ -0,0 +1,231 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a simple "malloc" implementation intended for use with Unified
+   Shared Memory.  It allocates memory from a pool allocated and configured
+   by the device plugin, and is intended to be #included from there.  It
+   keeps the allocated/free chain in a side-table (splay tree) to ensure that
+   the allocation routine does not migrate all the USM pages back into host
+   memory.  */
+
+
+#include "libgomp.h"
+
+/* Use a splay tree to track USM allocations.  */
+
+typedef struct usm_splay_tree_node_s *usm_splay_tree_node;
+typedef struct usm_splay_tree_s *usm_splay_tree;
+typedef struct usm_splay_tree_key_s *usm_splay_tree_key;
+
+struct usm_splay_tree_key_s {
+  void *base;
+  size_t size;
+};
+
+static inline int
+usm_splay_compare (usm_splay_tree_key x, usm_splay_tree_key y)
+{
+  return (x->base == y->base ? 0
+	  : x->base > y->base ? 1
+	  : -1);
+}
+#define splay_tree_prefix usm
+#include "splay-tree.h"
+
+static int usm_lock = 0;
+static struct usm_splay_tree_s usm_allocations = { NULL };
+static struct usm_splay_tree_s usm_free_space = { NULL };
+
+#define ALIGN(VAR) (((VAR) + 7) & ~7)    /* 8-byte granularity.  */
+
+/* Coalesce contiguous free space into one entry.  This considers the entries
+   on either side of the root node only, so it should be called each time a
+   new entry is inserted at the root.  */
+
+static void
+usm_coalesce_free_space ()
+{
+  usm_splay_tree_node prev, next, node = usm_free_space.root;
+
+  for (prev = node->left; prev && prev->right; prev = prev->right)
+    ;
+  for (next = node->right; next && next->left; next = next->left)
+    ;
+
+  /* Coalesce adjacent free chunks.  */
+  if (next
+      && node->key.base + node->key.size == next->key.base)
+    {
+      /* Free chunk follows.  */
+      node->key.size += next->key.size;
+      usm_splay_tree_remove (&usm_free_space, &next->key);
+      free (next);
+    }
+  if (prev
+      && prev->key.base + prev->key.size == node->key.base)
+    {
+      /* Free chunk precedes.  */
+      prev->key.size += node->key.size;
+      usm_splay_tree_remove (&usm_free_space, &node->key);
+      free (node);
+    }
+}
+
+/* Add a new memory region into the free chain.  This is how the USM heap is
+   initialized and extended.  If the new region is contiguous with an existing
+   region then any free space will be coalesced.  */
+
+static void
+usm_register_memory (char *base, size_t size)
+{
+  if (base == NULL)
+    return;
+
+  while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  usm_splay_tree_node node = malloc (sizeof (struct usm_splay_tree_node_s));
+  node->key.base = base;
+  node->key.size = size;
+  node->left = NULL;
+  node->right = NULL;
+  usm_splay_tree_insert (&usm_free_space, node);
+  usm_coalesce_free_space ();
+
+  __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+}
+
+/* This splay_tree_foreach callback selects the first free space large enough
+   to hold the allocation needed.  Since the splay_tree walk may start in the
+   middle, the "first" isn't necessarily the "leftmost" entry.  */
+
+struct usm_callback_data {
+  size_t size;
+  usm_splay_tree_node found;
+};
+
+static int
+usm_alloc_callback (usm_splay_tree_key key, void *data)
+{
+  struct usm_callback_data *cbd = (struct usm_callback_data *)data;
+
+  if (key->size >= cbd->size)
+    {
+      cbd->found = (usm_splay_tree_node)key;
+      return 1;
+    }
+
+  return 0;
+}
+
+/* USM "malloc".  Selects and moves and address range from usm_free_space to
+   usm_allocations, while leaving any excess in usm_free_space.  */
+
+static void *
+usm_alloc (size_t size)
+{
+  /* Memory is allocated with 8-byte granularity (see ALIGN above).  */
+  size = ALIGN (size);
+
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  if (!usm_free_space.root)
+    {
+      /* No memory registered, or no free space.  */
+      __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+      return NULL;
+    }
+
+  /* Find a suitable free block.  */
+  struct usm_callback_data cbd = {size, NULL};
+  usm_splay_tree_foreach_lazy (&usm_free_space, usm_alloc_callback, &cbd);
+  usm_splay_tree_node freenode = cbd.found;
+
+  void *result = NULL;
+  if (freenode)
+    {
+      /* Allocation successful.  */
+      result = freenode->key.base;
+      usm_splay_tree_node allocnode = malloc (sizeof (*allocnode));
+      allocnode->key.base = result;
+      allocnode->key.size = size;
+      allocnode->left = NULL;
+      allocnode->right = NULL;
+      usm_splay_tree_insert (&usm_allocations, allocnode);
+
+      /* Update the free chain.  */
+      size_t stillfree_size = freenode->key.size - size;
+      if (stillfree_size > 0)
+	{
+	  freenode->key.base = freenode->key.base + size;
+	  freenode->key.size = stillfree_size;
+	}
+      else
+	{
+	  usm_splay_tree_remove (&usm_free_space, &freenode->key);
+	  free (freenode);
+	}
+    }
+
+  /* Release the lock.  */
+  __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+
+  return result;
+}
+
+/* USM "free".  Moves an address range from usm_allocations to usm_free_space
+   and merges that record with any contiguous free memory.  */
+
+static void
+usm_free (void *addr)
+{
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  /* Move the allocation record over to the free-space tree.  */
+  struct usm_splay_tree_key_s key = {addr};
+  usm_splay_tree_key found = usm_splay_tree_lookup (&usm_allocations, &key);
+  if (!found)
+    GOMP_PLUGIN_fatal ("invalid free");
+  usm_splay_tree_remove (&usm_allocations, &key);
+  usm_splay_tree_insert (&usm_free_space, (usm_splay_tree_node)found);
+  usm_coalesce_free_space ();
+
+  /* Release the lock.  */
+  __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+}
+
+#undef ALIGN
+
+/* This allows splay-tree.c to call gomp_fatal in this context.  The splay
+   tree code doesn't use the variadic arguments right now.  */
+#define gomp_fatal(MSG, ...) GOMP_PLUGIN_fatal (MSG)
+
+/* Include the splay tree code inline, with the prefixes added.  */
+#define splay_tree_prefix usm
+#define splay_tree_c
+#include "splay-tree.h"

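(Also not part of the commit: to make the allocator's free-chain behaviour
easier to follow, here is a minimal, self-contained model of the same
first-fit / carve / coalesce scheme.  It is only a sketch: it uses a sorted
singly-linked list where usm-allocator.c uses splay trees, it omits the lock,
and every toy_* name is hypothetical rather than a libgomp interface.)

#include <stdio.h>
#include <stdlib.h>

#define ALIGN8(N) (((N) + 7) & ~(size_t) 7)   /* 8-byte granularity, as ALIGN */

struct toy_chunk { char *base; size_t size; struct toy_chunk *next; };

/* Free chunks, kept sorted by base address (the real code keeps them in the
   usm_free_space splay tree).  */
static struct toy_chunk *toy_free_list;

/* Merge any adjacent chunks in the sorted free list.  */
static void
toy_coalesce (void)
{
  struct toy_chunk *c = toy_free_list;
  while (c && c->next)
    {
      if (c->base + c->size == c->next->base)
	{
	  struct toy_chunk *n = c->next;
	  c->size += n->size;
	  c->next = n->next;
	  free (n);
	}
      else
	c = c->next;
    }
}

/* Add [base, base+size) to the free chain; mirrors usm_register_memory.  */
static void
toy_register (char *base, size_t size)
{
  struct toy_chunk **p = &toy_free_list;
  while (*p && (*p)->base < base)
    p = &(*p)->next;
  struct toy_chunk *c = malloc (sizeof *c);
  c->base = base;
  c->size = size;
  c->next = *p;
  *p = c;
  toy_coalesce ();
}

/* First-fit allocation; carves the request from the front of a free chunk
   and leaves the remainder, like usm_alloc.  */
static void *
toy_alloc (size_t size)
{
  size = ALIGN8 (size);
  for (struct toy_chunk **p = &toy_free_list; *p; p = &(*p)->next)
    if ((*p)->size >= size)
      {
	char *result = (*p)->base;
	if ((*p)->size == size)
	  {
	    struct toy_chunk *c = *p;
	    *p = c->next;
	    free (c);
	  }
	else
	  {
	    (*p)->base += size;
	    (*p)->size -= size;
	  }
	return result;
      }
  return NULL;   /* the plugin would grow the heap here, as usm_heap_create does */
}

/* Return a range to the free chain.  The real usm_free looks the size up in
   the usm_allocations tree; the toy passes it explicitly for brevity.  */
static void
toy_free (void *addr, size_t size)
{
  toy_register (addr, ALIGN8 (size));
}

int
main (void)
{
  static char pool[1024 * 1024];   /* stands in for the mmap'd USM heap */
  toy_register (pool, sizeof pool);

  void *a = toy_alloc (100);       /* rounded up to 104 bytes */
  void *b = toy_alloc (200);
  toy_free (a, 100);
  void *c = toy_alloc (64);        /* reuses the space freed from 'a' */
  printf ("a=%p b=%p c=%p (c == a: %s)\n", a, b, c, c == a ? "yes" : "no");
  return 0;
}

In this model the third allocation reuses the range freed from the first, just
as usm_alloc reuses space returned by usm_free, and freeing adjacent ranges
merges them back into one chunk, mirroring usm_coalesce_free_space.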