amdgcn, libgomp: custom USM allocator

There were problems caused by critical driver data sharing pages with USM
data, so this new allocator implementation moves USM allocations to entirely
different pages.

libgomp/ChangeLog:

	* plugin/plugin-gcn.c: Include sys/mman.h and unistd.h.
	(usm_heap_create): New function.
	(struct usm_splay_tree_key_s): Delete.
	(usm_splay_compare): Delete function.
	(splay_tree_prefix): Delete define.
	(GOMP_OFFLOAD_usm_alloc): Use new allocator.
	(GOMP_OFFLOAD_usm_free): Likewise.
	(GOMP_OFFLOAD_is_usm_ptr): Likewise.
	(gomp_fatal): Delete macro.
	(splay_tree_c): Delete.
	* usm-allocator.c: New file.
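
For reference, below is a minimal sketch (not part of this patch) of the kind
of program the USM heap serves.  It assumes a USM-capable GPU and a toolchain
in which "requires unified_shared_memory" is honoured, so that ordinary host
allocations become directly usable inside target regions:

/* Hypothetical USM test sketch; build with -fopenmp and amdgcn offloading
   enabled.  Not included in this patch.  */
#include <stdlib.h>
#include <assert.h>

#pragma omp requires unified_shared_memory

int
main (void)
{
  int n = 1024;
  int *a = malloc (n * sizeof (int));
  for (int i = 0; i < n; i++)
    a[i] = i;

  /* No explicit mapping: with USM in effect the host pointer is expected to
     be directly accessible from the device.  */
#pragma omp target
  for (int i = 0; i < n; i++)
    a[i] += 1;

  for (int i = 0; i < n; i++)
    assert (a[i] == i + 1);

  free (a);
  return 0;
}
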
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 3c0404c09b2..36fab3951d5 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -48,6 +48,8 @@
 #include "oacc-plugin.h"
 #include "oacc-int.h"
 #include <assert.h>
+#include <sys/mman.h>
+#include <unistd.h>
 
 /* These probably won't be in elf.h for a while.  */
 #ifndef R_AMDGPU_NONE
@@ -3071,6 +3073,102 @@ wait_queue (struct goacc_asyncqueue *aq)
 }
 
 /* }}}  */
+/* {{{ Unified Shared Memory
+
+   Normal heap memory is already enabled for USM, but by default it is "fine-
+   grained" memory, meaning that the GPU must access it via the system bus,
+   slowly.  Changing the page to "coarse-grained" mode means that the page
+   is migrated on-demand and can therefore be accessed quickly by both CPU and
+   GPU (although care should be taken to prevent thrashing the page back and
+   forth).
+
+   GOMP_OFFLOAD_alloc also allocates coarse-grained memory, but in that case
+   the initial location is GPU memory; GOMP_OFFLOAD_usm_alloc returns system
+   memory configured coarse-grained.
+
+   The USM memory space is allocated as a largish block and then subdivided
+   via a custom allocator.  (It would be possible to reconfigure regular
+   "malloc'd" memory, but if it ends up on the same page as memory used by
+   the HSA driver then bad things happen.)  */
+
+#include "../usm-allocator.c"
+
+/* Record a list of the memory blocks configured for USM.  */
+static struct usm_heap_pages {
+  void *start;
+  void *end;
+  struct usm_heap_pages *next;
+} *usm_heap_pages = NULL;
+
+/* Initialize or extend the USM memory space.  This is called whenever
+   allocation fails.  SIZE is the minimum size required for the failed
+   allocation to succeed; the function may choose a larger size.
+   Note that Linux lazy allocation means that the memory returned isn't
+   guaranteed to actually exist.  */
+
+static bool
+usm_heap_create (size_t size)
+{
+  static int lock = 0;
+  while (__atomic_exchange_n (&lock, 1, MEMMODEL_ACQUIRE) != 0)
+    ;
+
+  size_t default_size = 1L * 1024 * 1024 * 1024;  /* 1GB */
+  if (size < default_size)
+    size = default_size;
+
+  /* Round up to a whole page.  */
+  int pagesize = getpagesize ();
+  int misalignment = size % pagesize;
+  if (misalignment > 0)
+    size += pagesize - misalignment;
+
+  /* Try to get contiguous memory, but it might not be possible.
+     The most recent previous allocation is at the head of the list.  */
+  void *addrhint = (usm_heap_pages ? usm_heap_pages->end : NULL);
+  void *new_pages = mmap (addrhint, size, PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (new_pages == MAP_FAILED)
+    {
+      GCN_DEBUG ("Could not allocate Unified Shared Memory heap.");
+      __atomic_store_n (&lock, 0, MEMMODEL_RELEASE);
+      return false;
+    }
+
+  /* Register the heap allocation as coarse grained, which implies USM.  */
+  struct hsa_amd_svm_attribute_pair_s attr = {
+    HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
+    HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
+  };
+  hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size,
+                                                               &attr, 1);
+  if (status != HSA_STATUS_SUCCESS)
+    GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
+                       " please update your drivers and/or kernel");
+
+  if (new_pages == addrhint)
+    {
+      /* We got contiguous pages, so we don't need extra list entries.  */
+      usm_heap_pages->end += size;
+    }
+  else
+    {
+      /* We need a new list entry to record a discontiguous range.  */
+      struct usm_heap_pages *page = malloc (sizeof (*page));
+      page->start = new_pages;
+      page->end = new_pages + size;
+      page->next = usm_heap_pages;
+      usm_heap_pages = page;
+    }
+
+  /* Add the new memory into the allocator's free table.  */
+  usm_register_memory (new_pages, size);
+
+  __atomic_store_n (&lock, 0, MEMMODEL_RELEASE);
+  return true;
+}
+
+/* }}}  */
 /* {{{ OpenACC support  */
 
 /* Execute an OpenACC kernel, synchronously or asynchronously.  */
@@ -3905,74 +4003,24 @@ GOMP_OFFLOAD_evaluate_device (int device_num, const char *kind,
   return !isa || isa_code (isa) == agent->device_isa;
 }
 
-/* Use a splay tree to track USM allocations.  */
-typedef struct usm_splay_tree_node_s *usm_splay_tree_node;
-typedef struct usm_splay_tree_s *usm_splay_tree;
-typedef struct usm_splay_tree_key_s *usm_splay_tree_key;
-
-struct usm_splay_tree_key_s {
-  void *addr;
-  size_t size;
-};
-
-static inline int
-usm_splay_compare (usm_splay_tree_key x, usm_splay_tree_key y)
-{
-  if ((x->addr <= y->addr && x->addr + x->size > y->addr)
-      || (y->addr <= x->addr && y->addr + y->size > x->addr))
-    return 0;
-
-  return (x->addr > y->addr ? 1 : -1);
-}
-
-#define splay_tree_prefix usm
-#include "../splay-tree.h"
-
-static struct usm_splay_tree_s usm_map = { NULL };
-
-/* Allocate memory suitable for Unified Shared Memory.
-
-   Normal heap memory is already enabled for USM, but by default it is "fine-
-   grained" memory, meaning that the GPU must access it via the system bus,
-   slowly.  Changing the page to "coarse-grained" mode means that the page
-   is migrated on-demand and can therefore be accessed quickly by both CPU and
-   GPU (although care should be taken to prevent thrashing the page back and
-   forth).
-
-   GOMP_OFFLOAD_alloc also allocates coarse-grained memory, but in that case
-   the initial location is GPU memory; this function returns system memory.
-
-   We record and track allocations so that GOMP_OFFLOAD_is_usm_ptr can look
-   them up.  */
+/* Allocate memory suitable for Unified Shared Memory.  */
 
 void *
 GOMP_OFFLOAD_usm_alloc (int device, size_t size)
 {
-  void *ptr = malloc (size);
-  if (!ptr || !hsa_fns.hsa_amd_svm_attributes_set_fn)
-    return ptr;
-
-  /* Register the heap allocation as coarse grained, which implies USM.  */
-  struct hsa_amd_svm_attribute_pair_s attr = {
-    HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
-    HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
-  };
-  hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (ptr, size,
-                                                               &attr, 1);
-  if (status != HSA_STATUS_SUCCESS)
-    GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
-                       " please update your drivers and/or kernel");
-
-  /* Record the allocation for GOMP_OFFLOAD_is_usm_ptr.  */
-  usm_splay_tree_node node = malloc (sizeof (struct usm_splay_tree_node_s));
-  node->key.addr = ptr;
-  node->key.size = size;
-  node->left = NULL;
-  node->right = NULL;
-  usm_splay_tree_insert (&usm_map, node);
-
-  return ptr;
+  while (1)
+    {
+      void *result = usm_alloc (size);
+      if (result)
+        return result;
+
+      /* Allocation failed.  Try again if we can create a new heap block.
+         Note: it's possible another thread could get to the new memory
+         first, so the while loop is necessary.  */
+      if (!usm_heap_create (size))
+        return NULL;
+    }
 }
 
 /* Free memory allocated via GOMP_OFFLOAD_usm_alloc.  */
@@ -3980,15 +4028,7 @@ GOMP_OFFLOAD_usm_alloc (int device, size_t size)
 bool
 GOMP_OFFLOAD_usm_free (int device, void *ptr)
 {
-  struct usm_splay_tree_key_s key = { ptr, 1 };
-  usm_splay_tree_key node = usm_splay_tree_lookup (&usm_map, &key);
-  if (node)
-    {
-      usm_splay_tree_remove (&usm_map, &key);
-      free (node);
-    }
-
-  free (ptr);
+  usm_free (ptr);
   return true;
 }
 
@@ -3997,8 +4037,10 @@ GOMP_OFFLOAD_usm_free (int device, void *ptr)
 bool
 GOMP_OFFLOAD_is_usm_ptr (void *ptr)
 {
-  struct usm_splay_tree_key_s key = { ptr, 1 };
-  return usm_splay_tree_lookup (&usm_map, &key);
+  for (struct usm_heap_pages *heap = usm_heap_pages; heap; heap = heap->next)
+    if (ptr >= heap->start && ptr < heap->end)
+      return true;
+  return false;
 }
 
 /* }}}  */
@@ -4292,18 +4334,3 @@ GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
 }
 
 /* }}}  */
-/* {{{ USM splay tree  */
-
-/* Include this now so that splay-tree.c doesn't include it later.  This
-   avoids a conflict with splay_tree_prefix.  */
-#include "libgomp.h"
-
-/* This allows splay-tree.c to call gomp_fatal in this context.  The splay
-   tree code doesn't use the variadic arguments right now.  */
-#define gomp_fatal(MSG, ...) GOMP_PLUGIN_fatal (MSG)
-
-/* Include the splay tree code inline, with the prefixes added.  */
-#define splay_tree_prefix usm
-#define splay_tree_c
-#include "../splay-tree.h"
-/* }}}  */
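
Both usm_heap_create above and the allocator in the new file below serialize
themselves with the same hand-rolled spin lock built from GCC's atomic
built-ins (MEMMODEL_ACQUIRE and MEMMODEL_RELEASE are assumed here to be
libgomp's names for the corresponding __ATOMIC_* memory orders).  A
standalone sketch of that pattern, purely for illustration and not taken from
the patch:

/* Acquire/release spin lock sketch, using the raw __ATOMIC_* names instead
   of libgomp's MEMMODEL_* equivalents.  */
#include <stdio.h>

static int lock;

static void
lock_acquire (void)
{
  /* Spin until we are the thread that flips the flag from 0 to 1.  */
  while (__atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE) != 0)
    ;
}

static void
lock_release (void)
{
  __atomic_store_n (&lock, 0, __ATOMIC_RELEASE);
}

int
main (void)
{
  lock_acquire ();
  printf ("in the critical section\n");
  lock_release ();
  return 0;
}

The acquire order makes the data written by the previous lock holder visible,
and the release order publishes this holder's writes before the lock is seen
as free again.
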
diff --git a/libgomp/usm-allocator.c b/libgomp/usm-allocator.c
new file mode 100644
index 00000000000..c45109169ca
--- /dev/null
+++ b/libgomp/usm-allocator.c
@@ -0,0 +1,231 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a simple "malloc" implementation intended for use with Unified
+   Shared Memory.  It allocates memory from a pool allocated and configured
+   by the device plugin, and is intended to be #included from there.  It
+   keeps the allocated/free chain in a side-table (splay tree) to ensure that
+   the allocation routine does not migrate all the USM pages back into host
+   memory.  */
+
+
+#include "libgomp.h"
+
+/* Use a splay tree to track USM allocations.  */
+
+typedef struct usm_splay_tree_node_s *usm_splay_tree_node;
+typedef struct usm_splay_tree_s *usm_splay_tree;
+typedef struct usm_splay_tree_key_s *usm_splay_tree_key;
+
+struct usm_splay_tree_key_s {
+  void *base;
+  size_t size;
+};
+
+static inline int
+usm_splay_compare (usm_splay_tree_key x, usm_splay_tree_key y)
+{
+  return (x->base == y->base ? 0
+          : x->base > y->base ? 1
+          : -1);
+}
+#define splay_tree_prefix usm
+#include "splay-tree.h"
+
+static int usm_lock = 0;
+static struct usm_splay_tree_s usm_allocations = { NULL };
+static struct usm_splay_tree_s usm_free_space = { NULL };
+
+#define ALIGN(VAR) (((VAR) + 7) & ~7)  /* 8-byte granularity.  */
+
+/* Coalesce contiguous free space into one entry.  This considers the entries
+   either side of the root node only, so it should be called each time a new
+   entry is inserted into the root.  */
+
+static void
+usm_coalesce_free_space (void)
+{
+  usm_splay_tree_node prev, next, node = usm_free_space.root;
+
+  for (prev = node->left; prev && prev->right; prev = prev->right)
+    ;
+  for (next = node->right; next && next->left; next = next->left)
+    ;
+
+  /* Coalesce adjacent free chunks.  */
+  if (next
+      && node->key.base + node->key.size == next->key.base)
+    {
+      /* Free chunk follows.  */
+      node->key.size += next->key.size;
+      usm_splay_tree_remove (&usm_free_space, &next->key);
+      free (next);
+    }
+  if (prev
+      && prev->key.base + prev->key.size == node->key.base)
+    {
+      /* Free chunk precedes.  */
+      prev->key.size += node->key.size;
+      usm_splay_tree_remove (&usm_free_space, &node->key);
+      free (node);
+    }
+}
+
+/* Add a new memory region into the free chain.  This is how the USM heap is
+   initialized and extended.  If the new region is contiguous with an existing
+   region then any free space will be coalesced.  */
+
+static void
+usm_register_memory (char *base, size_t size)
+{
+  if (base == NULL)
+    return;
+
+  while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  usm_splay_tree_node node = malloc (sizeof (struct usm_splay_tree_node_s));
+  node->key.base = base;
+  node->key.size = size;
+  node->left = NULL;
+  node->right = NULL;
+  usm_splay_tree_insert (&usm_free_space, node);
+  usm_coalesce_free_space ();
+
+  __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+}
+
+/* This splay_tree_foreach callback selects the first free space large enough
+   to hold the allocation needed.  Since the splay_tree walk may start in the
+   middle, the "first" isn't necessarily the "leftmost" entry.  */
+
+struct usm_callback_data {
+  size_t size;
+  usm_splay_tree_node found;
+};
+
+static int
+usm_alloc_callback (usm_splay_tree_key key, void *data)
+{
+  struct usm_callback_data *cbd = (struct usm_callback_data *)data;
+
+  if (key->size >= cbd->size)
+    {
+      cbd->found = (usm_splay_tree_node)key;
+      return 1;
+    }
+
+  return 0;
+}
+
+/* USM "malloc".  Selects and moves an address range from usm_free_space to
+   usm_allocations, while leaving any excess in usm_free_space.  */
+
+static void *
+usm_alloc (size_t size)
+{
+  /* Memory is allocated in N-byte granularity.  */
+  size = ALIGN (size);
+
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  if (!usm_free_space.root)
+    {
+      /* No memory registered, or no free space.  */
+      __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+      return NULL;
+    }
+
+  /* Find a suitable free block.  */
+  struct usm_callback_data cbd = {size, NULL};
+  usm_splay_tree_foreach_lazy (&usm_free_space, usm_alloc_callback, &cbd);
+  usm_splay_tree_node freenode = cbd.found;
+
+  void *result = NULL;
+  if (freenode)
+    {
+      /* Allocation successful.  */
+      result = freenode->key.base;
+      usm_splay_tree_node allocnode = malloc (sizeof (*allocnode));
+      allocnode->key.base = result;
+      allocnode->key.size = size;
+      allocnode->left = NULL;
+      allocnode->right = NULL;
+      usm_splay_tree_insert (&usm_allocations, allocnode);
+
+      /* Update the free chain.  */
+      size_t stillfree_size = freenode->key.size - size;
+      if (stillfree_size > 0)
+        {
+          freenode->key.base = freenode->key.base + size;
+          freenode->key.size = stillfree_size;
+        }
+      else
+        {
+          usm_splay_tree_remove (&usm_free_space, &freenode->key);
+          free (freenode);
+        }
+    }
+
+  /* Release the lock.  */
+  __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+
+  return result;
+}
+
+/* USM "free".  Moves an address range from usm_allocations to usm_free_space
+   and merges that record with any contiguous free memory.  */
+
+static void
+usm_free (void *addr)
+{
+  /* Acquire the lock.  */
+  while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+    ;
+
+  /* Convert the memory map to free.  */
+  struct usm_splay_tree_key_s key = {addr};
+  usm_splay_tree_key found = usm_splay_tree_lookup (&usm_allocations, &key);
+  if (!found)
+    GOMP_PLUGIN_fatal ("invalid free");
+  usm_splay_tree_remove (&usm_allocations, &key);
+  usm_splay_tree_insert (&usm_free_space, (usm_splay_tree_node)found);
+  usm_coalesce_free_space ();
+
+  /* Release the lock.  */
+  __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+}
+
+#undef ALIGN
+
+/* This allows splay-tree.c to call gomp_fatal in this context.  The splay
+   tree code doesn't use the variadic arguments right now.  */
+#define gomp_fatal(MSG, ...) GOMP_PLUGIN_fatal (MSG)
+
+/* Include the splay tree code inline, with the prefixes added.  */
+#define splay_tree_prefix usm
+#define splay_tree_c
+#include "splay-tree.h"
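
The allocation strategy in usm-allocator.c is first fit with block splitting
on allocation and coalescing of neighbouring free chunks on free.  The sketch
below illustrates the same idea in a self-contained, single-threaded form,
using an address-sorted linked free list in place of the splay trees.  It is
an illustration only, not code from the patch, and unlike the real allocator
it hands the caller an explicit record to free rather than looking the
address up in an allocations table.

/* First-fit allocation with splitting and coalescing over one registered
   region; a simplified stand-in for the usm-allocator.c logic.  */
#include <assert.h>
#include <stdlib.h>

struct chunk { char *base; size_t size; struct chunk *next; };
static struct chunk *free_list;  /* Sorted by address.  */

#define ALIGN8(N) (((N) + 7) & ~(size_t) 7)

static void
register_memory (char *base, size_t size)
{
  struct chunk *c = malloc (sizeof *c);
  c->base = base;
  c->size = size;
  c->next = free_list;
  free_list = c;
}

static void *
toy_alloc (size_t size, struct chunk **record)
{
  size = ALIGN8 (size);
  for (struct chunk **p = &free_list; *p; p = &(*p)->next)
    if ((*p)->size >= size)
      {
        /* First fit: take the front of this chunk.  */
        struct chunk *c = *p;
        struct chunk *a = malloc (sizeof *a);
        a->base = c->base;
        a->size = size;
        a->next = NULL;
        *record = a;
        if (c->size == size)
          {
            /* Exact fit: the free chunk disappears.  */
            *p = c->next;
            free (c);
          }
        else
          {
            /* Split: the tail stays free.  */
            c->base += size;
            c->size -= size;
          }
        return a->base;
      }
  return NULL;
}

static void
toy_free (struct chunk *a)
{
  /* Reinsert into the address-sorted free list.  */
  struct chunk *prev = NULL, *cur = free_list;
  while (cur && cur->base < a->base)
    {
      prev = cur;
      cur = cur->next;
    }
  a->next = cur;
  if (prev)
    prev->next = a;
  else
    free_list = a;

  /* Coalesce with the following chunk, then with the preceding one.  */
  if (cur && a->base + a->size == cur->base)
    {
      a->size += cur->size;
      a->next = cur->next;
      free (cur);
    }
  if (prev && prev->base + prev->size == a->base)
    {
      prev->size += a->size;
      prev->next = a->next;
      free (a);
    }
}

int
main (void)
{
  static char pool[1 << 16];
  register_memory (pool, sizeof pool);

  struct chunk *r1, *r2;
  void *p1 = toy_alloc (100, &r1);  /* Rounded up to 104 bytes.  */
  void *p2 = toy_alloc (200, &r2);
  assert (p1 == pool && p2 == pool + 104);

  toy_free (r1);  /* Leaves a hole at the front of the pool.  */
  toy_free (r2);  /* Coalesces everything back into one chunk.  */
  assert (free_list->size == sizeof pool && free_list->next == NULL);
  return 0;
}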