public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc/devel/omp/gcc-12] nvptx, libgomp: Move the low-latency allocator code
@ 2023-02-16 18:02 Andrew Stubbs
0 siblings, 0 replies; only message in thread
From: Andrew Stubbs @ 2023-02-16 18:02 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:9583738a62a33a276b2aad980a27e77097f95924
commit 9583738a62a33a276b2aad980a27e77097f95924
Author: Andrew Stubbs <ams@codesourcery.com>
Date: Tue Dec 13 23:31:21 2022 +0000
nvptx, libgomp: Move the low-latency allocator code
There shouldn't be a functionality change; this is just so AMD can share
the code.
The new basic-allocator.c is designed to be included so it can be used as a
template multiple times and inlined.
libgomp/ChangeLog:
* config/nvptx/allocator.c (BASIC_ALLOC_PREFIX): New define, and
include basic-allocator.c.
(__nvptx_lowlat_heap_root): Remove.
(heapdesc): Remove.
(nvptx_memspace_alloc): Move implementation to basic-allocator.c.
(nvptx_memspace_calloc): Likewise.
(nvptx_memspace_free): Likewise.
(nvptx_memspace_realloc): Likewise.
* config/nvptx/team.c (__nvptx_lowlat_heap_root): Remove.
(gomp_nvptx_main): Call __nvptx_lowlat_init.
* basic-allocator.c: New file.
Diff:
---
libgomp/ChangeLog.omp | 14 ++
libgomp/basic-allocator.c | 380 +++++++++++++++++++++++++++++++++++++++
libgomp/config/nvptx/allocator.c | 268 +--------------------------
libgomp/config/nvptx/team.c | 18 +-
4 files changed, 407 insertions(+), 273 deletions(-)
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index af31412a0c6..cfcd3ca1d58 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -13,6 +13,20 @@
* testsuite/libgomp.fortran/target-nowait-array-section.f90: Fix
comment typo and improve its wording.
+2023-02-16 Andrew Stubbs <ams@codesourcery.com>
+
+ * config/nvptx/allocator.c (BASIC_ALLOC_PREFIX): New define, and
+ include basic-allocator.c.
+ (__nvptx_lowlat_heap_root): Remove.
+ (heapdesc): Remove.
+ (nvptx_memspace_alloc): Move implementation to basic-allocator.c.
+ (nvptx_memspace_calloc): Likewise.
+ (nvptx_memspace_free): Likewise.
+ (nvptx_memspace_realloc): Likewise.
+ * config/nvptx/team.c (__nvptx_lowlat_heap_root): Remove.
+ (gomp_nvptx_main): Call __nvptx_lowlat_init.
+ * basic-allocator.c: New file.
+
2023-02-15 Thomas Schwinge <thomas@codesourcery.com>
* testsuite/libgomp.c-c++-common/target-present-1.c: Fix.
diff --git a/libgomp/basic-allocator.c b/libgomp/basic-allocator.c
new file mode 100644
index 00000000000..94b99a89e0b
--- /dev/null
+++ b/libgomp/basic-allocator.c
@@ -0,0 +1,380 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is a basic "malloc" implementation intended for use with small,
+ low-latency memories.
+
+ To use this template, define BASIC_ALLOC_PREFIX, and then #include the
+ source file. The other configuration macros are optional.
+
+ The root heap descriptor is stored in the first bytes of the heap, and each
+ free chunk contains a similar descriptor for the next free chunk in the
+ chain.
+
+ The descriptor is two values: offset and size, which describe the
+ location of a chunk of memory available for allocation. The offset is
+ relative to the base of the heap. The special offset value 0xffffffff
+ indicates that the heap (free chain) is locked. The offset and size are
+ 32-bit values so the base alignment can be 8-bytes.
+
+ Memory is allocated to the first free chunk that fits. The free chain
+ is always stored in order of the offset to assist coalescing adjacent
+ chunks. */
+
+#include "libgomp.h"
+
+#ifndef BASIC_ALLOC_PREFIX
+#error "BASIC_ALLOC_PREFIX not defined."
+#endif
+
+#ifndef BASIC_ALLOC_YIELD
+#define BASIC_ALLOC_YIELD
+#endif
+
+#define ALIGN(VAR) (((VAR) + 7) & ~7) /* 8-byte granularity. */
+
+#define fn1(prefix, name) prefix ## _ ## name
+#define fn(prefix, name) fn1 (prefix, name)
+#define basic_alloc_init fn(BASIC_ALLOC_PREFIX,init)
+#define basic_alloc_alloc fn(BASIC_ALLOC_PREFIX,alloc)
+#define basic_alloc_calloc fn(BASIC_ALLOC_PREFIX,calloc)
+#define basic_alloc_free fn(BASIC_ALLOC_PREFIX,free)
+#define basic_alloc_realloc fn(BASIC_ALLOC_PREFIX,realloc)
+
+typedef struct {
+ uint32_t offset;
+ uint32_t size;
+} heapdesc;
+
+void
+basic_alloc_init (char *heap, size_t limit)
+{
+ if (heap == NULL)
+ return;
+
+ /* Initialize the head of the free chain. */
+ heapdesc *root = (heapdesc*)heap;
+ root->offset = ALIGN(1);
+ root->size = limit - root->offset;
+
+ /* And terminate the chain. */
+ heapdesc *next = (heapdesc*)(heap + root->offset);
+ next->offset = 0;
+ next->size = 0;
+}
+
+static void *
+basic_alloc_alloc (char *heap, size_t size)
+{
+ if (heap == NULL)
+ return NULL;
+
+ /* Memory is allocated in N-byte granularity. */
+ size = ALIGN (size);
+
+ /* Acquire a lock on the low-latency heap. */
+ heapdesc root, *root_ptr = (heapdesc*)heap;
+ do
+ {
+ root.offset = __atomic_exchange_n (&root_ptr->offset, 0xffffffff,
+ MEMMODEL_ACQUIRE);
+ if (root.offset != 0xffffffff)
+ {
+ root.size = root_ptr->size;
+ break;
+ }
+ /* Spin. */
+ BASIC_ALLOC_YIELD;
+ }
+ while (1);
+
+ /* Walk the free chain. */
+ heapdesc chunk = root;
+ heapdesc *prev_chunkptr = NULL;
+ heapdesc *chunkptr = (heapdesc*)(heap + chunk.offset);
+ heapdesc onward_chain = *chunkptr;
+ while (chunk.size != 0 && (uint32_t)size > chunk.size)
+ {
+ chunk = onward_chain;
+ prev_chunkptr = chunkptr;
+ chunkptr = (heapdesc*)(heap + chunk.offset);
+ onward_chain = *chunkptr;
+ }
+
+ void *result = NULL;
+ if (chunk.size != 0)
+ {
+ /* Allocation successful. */
+ result = chunkptr;
+
+ /* Update the free chain. */
+ heapdesc stillfree = chunk;
+ stillfree.offset += size;
+ stillfree.size -= size;
+ heapdesc *stillfreeptr = (heapdesc*)(heap + stillfree.offset);
+
+ if (stillfree.size == 0)
+ /* The whole chunk was used. */
+ stillfree = onward_chain;
+ else
+ /* The chunk was split, so restore the onward chain. */
+ *stillfreeptr = onward_chain;
+
+ /* The previous free slot or root now points to stillfree. */
+ if (prev_chunkptr)
+ *prev_chunkptr = stillfree;
+ else
+ root = stillfree;
+ }
+
+ /* Update the free chain root and release the lock. */
+ root_ptr->size = root.size;
+ __atomic_store_n (&root_ptr->offset, root.offset, MEMMODEL_RELEASE);
+
+ return result;
+}
+
+static void *
+basic_alloc_calloc (char *heap, size_t size)
+{
+ /* Memory is allocated in N-byte granularity. */
+ size = ALIGN (size);
+
+ uint64_t *result = basic_alloc_alloc (heap, size);
+ if (result)
+ /* Inline memset in which we know size is a multiple of 8. */
+ for (unsigned i = 0; i < (unsigned)size/8; i++)
+ result[i] = 0;
+
+ return result;
+}
+
+static void
+basic_alloc_free (char *heap, void *addr, size_t size)
+{
+ /* Memory is allocated in N-byte granularity. */
+ size = ALIGN (size);
+
+ /* Acquire a lock on the low-latency heap. */
+ heapdesc root, *root_ptr = (heapdesc*)heap;
+ do
+ {
+ root.offset = __atomic_exchange_n (&root_ptr->offset, 0xffffffff,
+ MEMMODEL_ACQUIRE);
+ if (root.offset != 0xffffffff)
+ {
+ root.size = root_ptr->size;
+ break;
+ }
+ /* Spin. */
+ }
+ while (1);
+
+ /* Walk the free chain to find where to insert a new entry. */
+ heapdesc chunk = root, prev_chunk;
+ heapdesc *prev_chunkptr = NULL, *prevprev_chunkptr = NULL;
+ heapdesc *chunkptr = (heapdesc*)(heap + chunk.offset);
+ heapdesc onward_chain = *chunkptr;
+ while (chunk.size != 0 && addr > (void*)chunkptr)
+ {
+ prev_chunk = chunk;
+ chunk = onward_chain;
+ prevprev_chunkptr = prev_chunkptr;
+ prev_chunkptr = chunkptr;
+ chunkptr = (heapdesc*)(heap + chunk.offset);
+ onward_chain = *chunkptr;
+ }
+
+ /* Create the new chunk descriptor. */
+ heapdesc newfreechunk;
+ newfreechunk.offset = (uint32_t)((uintptr_t)addr - (uintptr_t)heap);
+ newfreechunk.size = (uint32_t)size;
+
+ /* Coalesce adjacent free chunks. */
+ if (newfreechunk.offset + size == chunk.offset)
+ {
+ /* Free chunk follows. */
+ newfreechunk.size += chunk.size;
+ chunk = onward_chain;
+ }
+ if (prev_chunkptr)
+ {
+ if (prev_chunk.offset + prev_chunk.size
+ == newfreechunk.offset)
+ {
+ /* Free chunk precedes. */
+ newfreechunk.offset = prev_chunk.offset;
+ newfreechunk.size += prev_chunk.size;
+ addr = heap + prev_chunk.offset;
+ prev_chunkptr = prevprev_chunkptr;
+ }
+ }
+
+ /* Update the free chain in the new and previous chunks. */
+ *(heapdesc*)addr = chunk;
+ if (prev_chunkptr)
+ *prev_chunkptr = newfreechunk;
+ else
+ root = newfreechunk;
+
+ /* Update the free chain root and release the lock. */
+ root_ptr->size = root.size;
+ __atomic_store_n (&root_ptr->offset, root.offset, MEMMODEL_RELEASE);
+
+}
+
+static void *
+basic_alloc_realloc (char *heap, void *addr, size_t oldsize,
+ size_t size)
+{
+ /* Memory is allocated in N-byte granularity. */
+ oldsize = ALIGN (oldsize);
+ size = ALIGN (size);
+
+ if (oldsize == size)
+ return addr;
+
+ /* Acquire a lock on the low-latency heap. */
+ heapdesc root, *root_ptr = (heapdesc*)heap;
+ do
+ {
+ root.offset = __atomic_exchange_n (&root_ptr->offset, 0xffffffff,
+ MEMMODEL_ACQUIRE);
+ if (root.offset != 0xffffffff)
+ {
+ root.size = root_ptr->size;
+ break;
+ }
+ /* Spin. */
+ }
+ while (1);
+
+ /* Walk the free chain. */
+ heapdesc chunk = root;
+ heapdesc *prev_chunkptr = NULL;
+ heapdesc *chunkptr = (heapdesc*)(heap + chunk.offset);
+ heapdesc onward_chain = *chunkptr;
+ while (chunk.size != 0 && (void*)chunkptr < addr)
+ {
+ chunk = onward_chain;
+ prev_chunkptr = chunkptr;
+ chunkptr = (heapdesc*)(heap + chunk.offset);
+ onward_chain = *chunkptr;
+ }
+
+ void *result = NULL;
+ if (size < oldsize)
+ {
+ /* The new allocation is smaller than the old; we can always
+ shrink an allocation in place. */
+ result = addr;
+
+ heapdesc *nowfreeptr = (heapdesc*)(addr + size);
+
+ /* Update the free chain. */
+ heapdesc nowfree;
+ nowfree.offset = (char*)nowfreeptr - heap;
+ nowfree.size = oldsize - size;
+
+ if (nowfree.offset + nowfree.size == chunk.offset)
+ {
+ /* Coalesce following free chunk. */
+ nowfree.size += chunk.size;
+ *nowfreeptr = onward_chain;
+ }
+ else
+ *nowfreeptr = chunk;
+
+ /* The previous free slot or root now points to nowfree. */
+ if (prev_chunkptr)
+ *prev_chunkptr = nowfree;
+ else
+ root = nowfree;
+ }
+ else if (chunk.size != 0
+ && (char *)addr + oldsize == (char *)chunkptr
+ && chunk.size >= size-oldsize)
+ {
+ /* The new allocation is larger than the old, and we found a
+ large enough free block right after the existing block,
+ so we extend into that space. */
+ result = addr;
+
+ uint32_t delta = size-oldsize;
+
+ /* Update the free chain. */
+ heapdesc stillfree = chunk;
+ stillfree.offset += delta;
+ stillfree.size -= delta;
+ heapdesc *stillfreeptr = (heapdesc*)(heap + stillfree.offset);
+
+ if (stillfree.size == 0)
+ /* The whole chunk was used. */
+ stillfree = onward_chain;
+ else
+ /* The chunk was split, so restore the onward chain. */
+ *stillfreeptr = onward_chain;
+
+ /* The previous free slot or root now points to stillfree. */
+ if (prev_chunkptr)
+ *prev_chunkptr = stillfree;
+ else
+ root = stillfree;
+ }
+ /* Else realloc in-place has failed and result remains NULL. */
+
+ /* Update the free chain root and release the lock. */
+ root_ptr->size = root.size;
+ __atomic_store_n (&root_ptr->offset, root.offset, MEMMODEL_RELEASE);
+
+ if (result == NULL)
+ {
+ /* The allocation could not be extended in place, so we simply
+ allocate fresh memory and move the data. If we can't allocate
+ from low-latency memory then we leave the original allocation
+ intact and return NULL.
+ We could do a fall-back to main memory, but we don't know what
+ the fall-back trait said to do. */
+ result = basic_alloc_alloc (heap, size);
+ if (result != NULL)
+ {
+ /* Inline memcpy in which we know oldsize is a multiple of 8. */
+ uint64_t *from = addr, *to = result;
+ for (unsigned i = 0; i < (unsigned)oldsize/8; i++)
+ to[i] = from[i];
+
+ basic_alloc_free (heap, addr, oldsize);
+ }
+ }
+
+ return result;
+}
+
+#undef ALIGN
+#undef fn1
+#undef fn
+#undef basic_alloc_init
+#undef basic_alloc_alloc
+#undef basic_alloc_free
+#undef basic_alloc_realloc
diff --git a/libgomp/config/nvptx/allocator.c b/libgomp/config/nvptx/allocator.c
index c1a73511623..7c2a7463bf7 100644
--- a/libgomp/config/nvptx/allocator.c
+++ b/libgomp/config/nvptx/allocator.c
@@ -44,20 +44,13 @@
#include "libgomp.h"
#include <stdlib.h>
+#define BASIC_ALLOC_PREFIX __nvptx_lowlat
+#include "../../basic-allocator.c"
+
/* There should be some .shared space reserved for us. There's no way to
express this magic extern sizeless array in C so use asm. */
asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
-extern uint32_t __nvptx_lowlat_heap_root __attribute__((shared,nocommon));
-
-typedef union {
- uint32_t raw;
- struct {
- uint16_t offset;
- uint16_t size;
- } desc;
-} heapdesc;
-
static void *
nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
{
@@ -66,64 +59,7 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
char *shared_pool;
asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
- /* Memory is allocated in 8-byte granularity. */
- size = (size + 7) & ~7;
-
- /* Acquire a lock on the low-latency heap. */
- heapdesc root;
- do
- {
- root.raw = __atomic_exchange_n (&__nvptx_lowlat_heap_root,
- 0xffffffff, MEMMODEL_ACQUIRE);
- if (root.raw != 0xffffffff)
- break;
- /* Spin. */
- }
- while (1);
-
- /* Walk the free chain. */
- heapdesc chunk = {root.raw};
- uint32_t *prev_chunkptr = NULL;
- uint32_t *chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
- heapdesc onward_chain = {chunkptr[0]};
- while (chunk.desc.size != 0 && (uint32_t)size > chunk.desc.size)
- {
- chunk.raw = onward_chain.raw;
- prev_chunkptr = chunkptr;
- chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
- onward_chain.raw = chunkptr[0];
- }
-
- void *result = NULL;
- if (chunk.desc.size != 0)
- {
- /* Allocation successful. */
- result = chunkptr;
-
- /* Update the free chain. */
- heapdesc stillfree = {chunk.raw};
- stillfree.desc.offset += size;
- stillfree.desc.size -= size;
- uint32_t *stillfreeptr = (uint32_t*)(shared_pool
- + stillfree.desc.offset);
-
- if (stillfree.desc.size == 0)
- /* The whole chunk was used. */
- stillfree.raw = onward_chain.raw;
- else
- /* The chunk was split, so restore the onward chain. */
- stillfreeptr[0] = onward_chain.raw;
-
- /* The previous free slot or root now points to stillfree. */
- if (prev_chunkptr)
- prev_chunkptr[0] = stillfree.raw;
- else
- root.raw = stillfree.raw;
- }
-
- /* Update the free chain root and release the lock. */
- __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
- return result;
+ return __nvptx_lowlat_alloc (shared_pool, size);
}
else if (memspace == ompx_host_mem_space)
return NULL;
@@ -136,16 +72,10 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
{
if (memspace == omp_low_lat_mem_space)
{
- /* Memory is allocated in 8-byte granularity. */
- size = (size + 7) & ~7;
-
- uint64_t *result = nvptx_memspace_alloc (memspace, size);
- if (result)
- /* Inline memset in which we know size is a multiple of 8. */
- for (unsigned i = 0; i < (unsigned)size/8; i++)
- result[i] = 0;
+ char *shared_pool;
+ asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
- return result;
+ return __nvptx_lowlat_calloc (shared_pool, size);
}
else if (memspace == ompx_host_mem_space)
return NULL;
@@ -161,71 +91,7 @@ nvptx_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
char *shared_pool;
asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
- /* Memory is allocated in 8-byte granularity. */
- size = (size + 7) & ~7;
-
- /* Acquire a lock on the low-latency heap. */
- heapdesc root;
- do
- {
- root.raw = __atomic_exchange_n (&__nvptx_lowlat_heap_root,
- 0xffffffff, MEMMODEL_ACQUIRE);
- if (root.raw != 0xffffffff)
- break;
- /* Spin. */
- }
- while (1);
-
- /* Walk the free chain to find where to insert a new entry. */
- heapdesc chunk = {root.raw}, prev_chunk;
- uint32_t *prev_chunkptr = NULL, *prevprev_chunkptr = NULL;
- uint32_t *chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
- heapdesc onward_chain = {chunkptr[0]};
- while (chunk.desc.size != 0 && addr > (void*)chunkptr)
- {
- prev_chunk.raw = chunk.raw;
- chunk.raw = onward_chain.raw;
- prevprev_chunkptr = prev_chunkptr;
- prev_chunkptr = chunkptr;
- chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
- onward_chain.raw = chunkptr[0];
- }
-
- /* Create the new chunk descriptor. */
- heapdesc newfreechunk;
- newfreechunk.desc.offset = (uint16_t)((uintptr_t)addr
- - (uintptr_t)shared_pool);
- newfreechunk.desc.size = (uint16_t)size;
-
- /* Coalesce adjacent free chunks. */
- if (newfreechunk.desc.offset + size == chunk.desc.offset)
- {
- /* Free chunk follows. */
- newfreechunk.desc.size += chunk.desc.size;
- chunk.raw = onward_chain.raw;
- }
- if (prev_chunkptr)
- {
- if (prev_chunk.desc.offset + prev_chunk.desc.size
- == newfreechunk.desc.offset)
- {
- /* Free chunk precedes. */
- newfreechunk.desc.offset = prev_chunk.desc.offset;
- newfreechunk.desc.size += prev_chunk.desc.size;
- addr = shared_pool + prev_chunk.desc.offset;
- prev_chunkptr = prevprev_chunkptr;
- }
- }
-
- /* Update the free chain in the new and previous chunks. */
- ((uint32_t*)addr)[0] = chunk.raw;
- if (prev_chunkptr)
- prev_chunkptr[0] = newfreechunk.raw;
- else
- root.raw = newfreechunk.raw;
-
- /* Update the free chain root and release the lock. */
- __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
+ __nvptx_lowlat_free (shared_pool, addr, size);
}
else
free (addr);
@@ -240,123 +106,7 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
char *shared_pool;
asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
- /* Memory is allocated in 8-byte granularity. */
- oldsize = (oldsize + 7) & ~7;
- size = (size + 7) & ~7;
-
- if (oldsize == size)
- return addr;
-
- /* Acquire a lock on the low-latency heap. */
- heapdesc root;
- do
- {
- root.raw = __atomic_exchange_n (&__nvptx_lowlat_heap_root,
- 0xffffffff, MEMMODEL_ACQUIRE);
- if (root.raw != 0xffffffff)
- break;
- /* Spin. */
- }
- while (1);
-
- /* Walk the free chain. */
- heapdesc chunk = {root.raw};
- uint32_t *prev_chunkptr = NULL;
- uint32_t *chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
- heapdesc onward_chain = {chunkptr[0]};
- while (chunk.desc.size != 0 && (void*)chunkptr < addr)
- {
- chunk.raw = onward_chain.raw;
- prev_chunkptr = chunkptr;
- chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
- onward_chain.raw = chunkptr[0];
- }
-
- void *result = NULL;
- if (size < oldsize)
- {
- /* The new allocation is smaller than the old; we can always
- shrink an allocation in place. */
- result = addr;
-
- uint32_t *nowfreeptr = (uint32_t*)(addr + size);
-
- /* Update the free chain. */
- heapdesc nowfree;
- nowfree.desc.offset = (char*)nowfreeptr - shared_pool;
- nowfree.desc.size = oldsize - size;
-
- if (nowfree.desc.offset + size == chunk.desc.offset)
- {
- /* Coalesce following free chunk. */
- nowfree.desc.size += chunk.desc.size;
- nowfreeptr[0] = onward_chain.raw;
- }
- else
- nowfreeptr[0] = chunk.raw;
-
- /* The previous free slot or root now points to nowfree. */
- if (prev_chunkptr)
- prev_chunkptr[0] = nowfree.raw;
- else
- root.raw = nowfree.raw;
- }
- else if (chunk.desc.size != 0
- && (char *)addr + oldsize == (char *)chunkptr
- && chunk.desc.size >= size-oldsize)
- {
- /* The new allocation is larger than the old, and we found a
- large enough free block right after the existing block,
- so we extend into that space. */
- result = addr;
-
- uint16_t delta = size-oldsize;
-
- /* Update the free chain. */
- heapdesc stillfree = {chunk.raw};
- stillfree.desc.offset += delta;
- stillfree.desc.size -= delta;
- uint32_t *stillfreeptr = (uint32_t*)(shared_pool
- + stillfree.desc.offset);
-
- if (stillfree.desc.size == 0)
- /* The whole chunk was used. */
- stillfree.raw = onward_chain.raw;
- else
- /* The chunk was split, so restore the onward chain. */
- stillfreeptr[0] = onward_chain.raw;
-
- /* The previous free slot or root now points to stillfree. */
- if (prev_chunkptr)
- prev_chunkptr[0] = stillfree.raw;
- else
- root.raw = stillfree.raw;
- }
- /* Else realloc in-place has failed and result remains NULL. */
-
- /* Update the free chain root and release the lock. */
- __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
-
- if (result == NULL)
- {
- /* The allocation could not be extended in place, so we simply
- allocate fresh memory and move the data. If we can't allocate
- from low-latency memory then we leave the original alloaction
- intact and return NULL.
- We could do a fall-back to main memory, but we don't know what
- the fall-back trait said to do. */
- result = nvptx_memspace_alloc (memspace, size);
- if (result != NULL)
- {
- /* Inline memcpy in which we know oldsize is a multiple of 8. */
- uint64_t *from = addr, *to = result;
- for (unsigned i = 0; i < (unsigned)oldsize/8; i++)
- to[i] = from[i];
-
- nvptx_memspace_free (memspace, addr, oldsize);
- }
- }
- return result;
+ return __nvptx_lowlat_realloc (shared_pool, addr, oldsize, size);
}
else if (memspace == ompx_host_mem_space)
return NULL;
diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
index 685610e00be..b30b8df178d 100644
--- a/libgomp/config/nvptx/team.c
+++ b/libgomp/config/nvptx/team.c
@@ -33,7 +33,6 @@
struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));
int __gomp_team_num __attribute__((shared,nocommon));
-uint32_t __nvptx_lowlat_heap_root __attribute__((shared,nocommon));
static void gomp_thread_start (struct gomp_thread_pool *);
@@ -41,6 +40,9 @@ static void gomp_thread_start (struct gomp_thread_pool *);
express this magic extern sizeless array in C so use asm. */
asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
+/* Defined in basic-allocator.c via config/nvptx/allocator.c. */
+void __nvptx_lowlat_init (void *heap, size_t size);
+
/* This externally visible function handles target region entry. It
sets up a per-team thread pool and transfers control by calling FN (FN_DATA)
in the master thread or gomp_thread_start in other threads.
@@ -76,19 +78,7 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data)
asm ("mov.u32\t%0, %%dynamic_smem_size;\n"
: "=r"(shared_pool_size));
#endif
-
- /* ... and initialize it with an empty free-chain. */
- union {
- uint32_t raw;
- struct {
- uint16_t offset;
- uint16_t size;
- } desc;
- } root;
- root.desc.offset = 0; /* The first byte is free. */
- root.desc.size = shared_pool_size; /* The whole space is free. */
- __nvptx_lowlat_heap_root = root.raw;
- shared_pool[0] = 0; /* Terminate free chain. */
+ __nvptx_lowlat_init (shared_pool, shared_pool_size);
/* Initialize the thread pool. */
struct gomp_thread_pool *pool = alloca (sizeof (*pool));
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-02-16 18:02 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-16 18:02 [gcc/devel/omp/gcc-12] nvptx, libgomp: Move the low-latency allocator code Andrew Stubbs
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).