From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path:
Received: by sourceware.org (Postfix, from userid 1461)
	id F1F053858D33; Thu, 16 Feb 2023 18:02:04 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org F1F053858D33
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1676570524;
	bh=zLZI7p+B3O0z3jPUpr6tDewD8MgFHCE34B/Y3apmSLY=;
	h=From:To:Subject:Date:From;
	b=DHYyv2+ogmK18pqTzOI88Kjf+P8ER2nYyrWV4OXaosYZ6NbGmNHcNSXUNxZn7fE5o
	 4ZhdznDfn7Qcz4qT7eC9CVoQKN4PQ6Jr4lTemtBZKQIgWR4Dc4Dq8+40Z7VzqUM/mS
	 coa4569PcV4s1akcBAqr2vvgnHM8WluMKRmEsBvQ=
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Andrew Stubbs
To: gcc-cvs@gcc.gnu.org
Subject: [gcc/devel/omp/gcc-12] nvptx, libgomp: Move the low-latency allocator code
X-Act-Checkin: gcc
X-Git-Author: Andrew Stubbs
X-Git-Refname: refs/heads/devel/omp/gcc-12
X-Git-Oldrev: 2e3327d635d261df923f60f7d21e47ff895cd2df
X-Git-Newrev: 9583738a62a33a276b2aad980a27e77097f95924
Message-Id: <20230216180204.F1F053858D33@sourceware.org>
Date: Thu, 16 Feb 2023 18:02:04 +0000 (GMT)
List-Id:

https://gcc.gnu.org/g:9583738a62a33a276b2aad980a27e77097f95924

commit 9583738a62a33a276b2aad980a27e77097f95924
Author: Andrew Stubbs
Date:   Tue Dec 13 23:31:21 2022 +0000

    nvptx, libgomp: Move the low-latency allocator code
    
    There shouldn't be a functionality change; this is just so AMD can
    share the code.
    
    The new basic-allocator.c is designed to be included so it can be used
    as a template multiple times and inlined.
    
    libgomp/ChangeLog:
    
    	* config/nvptx/allocator.c (BASIC_ALLOC_PREFIX): New define, and
    	include basic-allocator.c.
    	(__nvptx_lowlat_heap_root): Remove.
    	(heapdesc): Remove.
    	(nvptx_memspace_alloc): Move implementation to basic-allocator.c.
    	(nvptx_memspace_calloc): Likewise.
    	(nvptx_memspace_free): Likewise.
    	(nvptx_memspace_realloc): Likewise.
    	* config/nvptx/team.c (__nvptx_lowlat_heap_root): Remove.
    	(gomp_nvptx_main): Call __nvptx_lowlat_init.
    	* basic-allocator.c: New file.
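The template works by token-pasting BASIC_ALLOC_PREFIX onto each entry point
at the time basic-allocator.c is #included.  As a minimal sketch of a second
instantiation (the __example_lowlat prefix and file placement here are
hypothetical; only BASIC_ALLOC_PREFIX and the generated function names come
from this commit):

    #include "libgomp.h"

    #define BASIC_ALLOC_PREFIX __example_lowlat
    #include "../../basic-allocator.c"

    /* The include above now provides, via token pasting:
       void __example_lowlat_init (char *heap, size_t limit);
       static void *__example_lowlat_alloc (char *heap, size_t size);
       static void *__example_lowlat_calloc (char *heap, size_t size);
       static void __example_lowlat_free (char *heap, void *addr,
                                          size_t size);
       static void *__example_lowlat_realloc (char *heap, void *addr,
                                              size_t oldsize, size_t size);  */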
Diff:
---
 libgomp/ChangeLog.omp            |  14 ++
 libgomp/basic-allocator.c        | 380 +++++++++++++++++++++++++++++++++
 libgomp/config/nvptx/allocator.c | 268 +-------------------------
 libgomp/config/nvptx/team.c      |  18 +-
 4 files changed, 407 insertions(+), 273 deletions(-)

diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index af31412a0c6..cfcd3ca1d58 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -13,6 +13,20 @@
 	* testsuite/libgomp.fortran/target-nowait-array-section.f90: Fix
 	comment typo and improve its wording.
 
+2023-02-16  Andrew Stubbs
+
+	* config/nvptx/allocator.c (BASIC_ALLOC_PREFIX): New define, and
+	include basic-allocator.c.
+	(__nvptx_lowlat_heap_root): Remove.
+	(heapdesc): Remove.
+	(nvptx_memspace_alloc): Move implementation to basic-allocator.c.
+	(nvptx_memspace_calloc): Likewise.
+	(nvptx_memspace_free): Likewise.
+	(nvptx_memspace_realloc): Likewise.
+	* config/nvptx/team.c (__nvptx_lowlat_heap_root): Remove.
+	(gomp_nvptx_main): Call __nvptx_lowlat_init.
+	* basic-allocator.c: New file.
+
 2023-02-15  Thomas Schwinge
 
 	* testsuite/libgomp.c-c++-common/target-present-1.c: Fix.

diff --git a/libgomp/basic-allocator.c b/libgomp/basic-allocator.c
new file mode 100644
index 00000000000..94b99a89e0b
--- /dev/null
+++ b/libgomp/basic-allocator.c
@@ -0,0 +1,380 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a basic "malloc" implementation intended for use with small,
+   low-latency memories.
+
+   To use this template, define BASIC_ALLOC_PREFIX, and then #include the
+   source file.  The other configuration macros are optional.
+
+   The root heap descriptor is stored in the first bytes of the heap, and
+   each free chunk contains a similar descriptor for the next free chunk
+   in the chain.
+
+   The descriptor is two values: offset and size, which describe the
+   location of a chunk of memory available for allocation.  The offset is
+   relative to the base of the heap.  The special offset value 0xffffffff
+   indicates that the heap (free chain) is locked.  The offset and size
+   are 32-bit values so the base alignment can be 8 bytes.
+
+   Memory is allocated to the first free chunk that fits.  The free chain
+   is always stored in order of the offset to assist coalescing adjacent
+   chunks.  */
+
+#include "libgomp.h"
+
+#ifndef BASIC_ALLOC_PREFIX
+#error "BASIC_ALLOC_PREFIX not defined."
+#endif
+
+#ifndef BASIC_ALLOC_YIELD
+#define BASIC_ALLOC_YIELD
+#endif
+
+#define ALIGN(VAR) (((VAR) + 7) & ~7)  /* 8-byte granularity.  */
+
+#define fn1(prefix, name) prefix ## _ ## name
+#define fn(prefix, name) fn1 (prefix, name)
+#define basic_alloc_init fn(BASIC_ALLOC_PREFIX,init)
+#define basic_alloc_alloc fn(BASIC_ALLOC_PREFIX,alloc)
+#define basic_alloc_calloc fn(BASIC_ALLOC_PREFIX,calloc)
+#define basic_alloc_free fn(BASIC_ALLOC_PREFIX,free)
+#define basic_alloc_realloc fn(BASIC_ALLOC_PREFIX,realloc)
+
+typedef struct {
+  uint32_t offset;
+  uint32_t size;
+} heapdesc;
+
+void
+basic_alloc_init (char *heap, size_t limit)
+{
+  if (heap == NULL)
+    return;
+
+  /* Initialize the head of the free chain.  */
+  heapdesc *root = (heapdesc*)heap;
+  root->offset = ALIGN(1);
+  root->size = limit - root->offset;
+
+  /* And terminate the chain.  */
+  heapdesc *next = (heapdesc*)(heap + root->offset);
+  next->offset = 0;
+  next->size = 0;
+}
+
+static void *
+basic_alloc_alloc (char *heap, size_t size)
+{
+  if (heap == NULL)
+    return NULL;
+
+  /* Memory is allocated in N-byte granularity.  */
+  size = ALIGN (size);
+
+  /* Acquire a lock on the low-latency heap.  */
+  heapdesc root, *root_ptr = (heapdesc*)heap;
+  do
+    {
+      root.offset = __atomic_exchange_n (&root_ptr->offset, 0xffffffff,
+					 MEMMODEL_ACQUIRE);
+      if (root.offset != 0xffffffff)
+	{
+	  root.size = root_ptr->size;
+	  break;
+	}
+      /* Spin.  */
+      BASIC_ALLOC_YIELD;
+    }
+  while (1);
+
+  /* Walk the free chain.  */
+  heapdesc chunk = root;
+  heapdesc *prev_chunkptr = NULL;
+  heapdesc *chunkptr = (heapdesc*)(heap + chunk.offset);
+  heapdesc onward_chain = *chunkptr;
+  while (chunk.size != 0 && (uint32_t)size > chunk.size)
+    {
+      chunk = onward_chain;
+      prev_chunkptr = chunkptr;
+      chunkptr = (heapdesc*)(heap + chunk.offset);
+      onward_chain = *chunkptr;
+    }
+
+  void *result = NULL;
+  if (chunk.size != 0)
+    {
+      /* Allocation successful.  */
+      result = chunkptr;
+
+      /* Update the free chain.  */
+      heapdesc stillfree = chunk;
+      stillfree.offset += size;
+      stillfree.size -= size;
+      heapdesc *stillfreeptr = (heapdesc*)(heap + stillfree.offset);
+
+      if (stillfree.size == 0)
+	/* The whole chunk was used.  */
+	stillfree = onward_chain;
+      else
+	/* The chunk was split, so restore the onward chain.  */
+	*stillfreeptr = onward_chain;
+
+      /* The previous free slot or root now points to stillfree.  */
+      if (prev_chunkptr)
+	*prev_chunkptr = stillfree;
+      else
+	root = stillfree;
+    }
+
+  /* Update the free chain root and release the lock.  */
+  root_ptr->size = root.size;
+  __atomic_store_n (&root_ptr->offset, root.offset, MEMMODEL_RELEASE);
+
+  return result;
+}
+
+static void *
+basic_alloc_calloc (char *heap, size_t size)
+{
+  /* Memory is allocated in N-byte granularity.  */
+  size = ALIGN (size);
+
+  uint64_t *result = basic_alloc_alloc (heap, size);
+  if (result)
+    /* Inline memset in which we know size is a multiple of 8.  */
+    for (unsigned i = 0; i < (unsigned)size/8; i++)
+      result[i] = 0;
+
+  return result;
+}
+
+static void
+basic_alloc_free (char *heap, void *addr, size_t size)
+{
+  /* Memory is allocated in N-byte granularity.  */
+  size = ALIGN (size);
+
+  /* Acquire a lock on the low-latency heap.  */
+  heapdesc root, *root_ptr = (heapdesc*)heap;
+  do
+    {
+      root.offset = __atomic_exchange_n (&root_ptr->offset, 0xffffffff,
+					 MEMMODEL_ACQUIRE);
+      if (root.offset != 0xffffffff)
+	{
+	  root.size = root_ptr->size;
+	  break;
+	}
+      /* Spin.  */
+    }
+  while (1);
+
+  /* Walk the free chain to find where to insert a new entry.  */
+  heapdesc chunk = root, prev_chunk;
+  heapdesc *prev_chunkptr = NULL, *prevprev_chunkptr = NULL;
+  heapdesc *chunkptr = (heapdesc*)(heap + chunk.offset);
+  heapdesc onward_chain = *chunkptr;
+  while (chunk.size != 0 && addr > (void*)chunkptr)
+    {
+      prev_chunk = chunk;
+      chunk = onward_chain;
+      prevprev_chunkptr = prev_chunkptr;
+      prev_chunkptr = chunkptr;
+      chunkptr = (heapdesc*)(heap + chunk.offset);
+      onward_chain = *chunkptr;
+    }
+
+  /* Create the new chunk descriptor.  */
+  heapdesc newfreechunk;
+  newfreechunk.offset = (uint32_t)((uintptr_t)addr - (uintptr_t)heap);
+  newfreechunk.size = (uint32_t)size;
+
+  /* Coalesce adjacent free chunks.  */
+  if (newfreechunk.offset + size == chunk.offset)
+    {
+      /* Free chunk follows.  */
+      newfreechunk.size += chunk.size;
+      chunk = onward_chain;
+    }
+  if (prev_chunkptr)
+    {
+      if (prev_chunk.offset + prev_chunk.size
+	  == newfreechunk.offset)
+	{
+	  /* Free chunk precedes.  */
+	  newfreechunk.offset = prev_chunk.offset;
+	  newfreechunk.size += prev_chunk.size;
+	  addr = heap + prev_chunk.offset;
+	  prev_chunkptr = prevprev_chunkptr;
+	}
+    }
+
+  /* Update the free chain in the new and previous chunks.  */
+  *(heapdesc*)addr = chunk;
+  if (prev_chunkptr)
+    *prev_chunkptr = newfreechunk;
+  else
+    root = newfreechunk;
+
+  /* Update the free chain root and release the lock.  */
+  root_ptr->size = root.size;
+  __atomic_store_n (&root_ptr->offset, root.offset, MEMMODEL_RELEASE);
+
+}
+
+static void *
+basic_alloc_realloc (char *heap, void *addr, size_t oldsize,
+		     size_t size)
+{
+  /* Memory is allocated in N-byte granularity.  */
+  oldsize = ALIGN (oldsize);
+  size = ALIGN (size);
+
+  if (oldsize == size)
+    return addr;
+
+  /* Acquire a lock on the low-latency heap.  */
+  heapdesc root, *root_ptr = (heapdesc*)heap;
+  do
+    {
+      root.offset = __atomic_exchange_n (&root_ptr->offset, 0xffffffff,
+					 MEMMODEL_ACQUIRE);
+      if (root.offset != 0xffffffff)
+	{
+	  root.size = root_ptr->size;
+	  break;
+	}
+      /* Spin.  */
+    }
+  while (1);
+
+  /* Walk the free chain.  */
+  heapdesc chunk = root;
+  heapdesc *prev_chunkptr = NULL;
+  heapdesc *chunkptr = (heapdesc*)(heap + chunk.offset);
+  heapdesc onward_chain = *chunkptr;
+  while (chunk.size != 0 && (void*)chunkptr < addr)
+    {
+      chunk = onward_chain;
+      prev_chunkptr = chunkptr;
+      chunkptr = (heapdesc*)(heap + chunk.offset);
+      onward_chain = *chunkptr;
+    }
+
+  void *result = NULL;
+  if (size < oldsize)
+    {
+      /* The new allocation is smaller than the old; we can always
+	 shrink an allocation in place.  */
+      result = addr;
+
+      heapdesc *nowfreeptr = (heapdesc*)(addr + size);
+
+      /* Update the free chain.  */
+      heapdesc nowfree;
+      nowfree.offset = (char*)nowfreeptr - heap;
+      nowfree.size = oldsize - size;
+
+      if (nowfree.offset + nowfree.size == chunk.offset)
+	{
+	  /* Coalesce following free chunk.  */
+	  nowfree.size += chunk.size;
+	  *nowfreeptr = onward_chain;
+	}
+      else
+	*nowfreeptr = chunk;
+
+      /* The previous free slot or root now points to nowfree.  */
+      if (prev_chunkptr)
+	*prev_chunkptr = nowfree;
+      else
+	root = nowfree;
+    }
+  else if (chunk.size != 0
+	   && (char *)addr + oldsize == (char *)chunkptr
+	   && chunk.size >= size-oldsize)
+    {
+      /* The new allocation is larger than the old, and we found a
+	 large enough free block right after the existing block,
+	 so we extend into that space.  */
+      result = addr;
+
+      uint32_t delta = size-oldsize;
+
+      /* Update the free chain.  */
+      heapdesc stillfree = chunk;
+      stillfree.offset += delta;
+      stillfree.size -= delta;
+      heapdesc *stillfreeptr = (heapdesc*)(heap + stillfree.offset);
+
+      if (stillfree.size == 0)
+	/* The whole chunk was used.  */
+	stillfree = onward_chain;
+      else
+	/* The chunk was split, so restore the onward chain.  */
+	*stillfreeptr = onward_chain;
+
+      /* The previous free slot or root now points to stillfree.  */
+      if (prev_chunkptr)
+	*prev_chunkptr = stillfree;
+      else
+	root = stillfree;
+    }
+  /* Else realloc in-place has failed and result remains NULL.  */
+
+  /* Update the free chain root and release the lock.  */
+  root_ptr->size = root.size;
+  __atomic_store_n (&root_ptr->offset, root.offset, MEMMODEL_RELEASE);
+
+  if (result == NULL)
+    {
+      /* The allocation could not be extended in place, so we simply
+	 allocate fresh memory and move the data.  If we can't allocate
+	 from low-latency memory then we leave the original allocation
+	 intact and return NULL.
+	 We could do a fall-back to main memory, but we don't know what
+	 the fall-back trait said to do.  */
+      result = basic_alloc_alloc (heap, size);
+      if (result != NULL)
+	{
+	  /* Inline memcpy in which we know oldsize is a multiple of 8.  */
+	  uint64_t *from = addr, *to = result;
+	  for (unsigned i = 0; i < (unsigned)oldsize/8; i++)
+	    to[i] = from[i];
+
+	  basic_alloc_free (heap, addr, oldsize);
+	}
+    }
+
+  return result;
+}
+
+#undef ALIGN
+#undef fn1
+#undef fn
+#undef basic_alloc_init
+#undef basic_alloc_alloc
+#undef basic_alloc_calloc
+#undef basic_alloc_free
+#undef basic_alloc_realloc
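As a worked example of the descriptor scheme documented at the top of
basic-allocator.c (the 1024-byte heap size is hypothetical; the offsets
follow from the 8-byte ALIGN granularity above):

    After basic_alloc_init (heap, 1024):
      heap+0:  root descriptor  {offset = 8, size = 1016}
      heap+8:  chain terminator {offset = 0, size = 0}
    After p = basic_alloc_alloc (heap, 16)  (first fit, at offset 8):
      heap+0:  root descriptor  {offset = 24, size = 1000}
      heap+8:  the 16 bytes returned as p
      heap+24: chain terminator {offset = 0, size = 0}
    basic_alloc_free (heap, p, 16) then walks the chain in offset order,
    finds that the free chunk at offset 24 immediately follows the block
    being freed, and coalesces the two, restoring {offset = 8, size = 1016}
    at heap+0.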
diff --git a/libgomp/config/nvptx/allocator.c b/libgomp/config/nvptx/allocator.c
index c1a73511623..7c2a7463bf7 100644
--- a/libgomp/config/nvptx/allocator.c
+++ b/libgomp/config/nvptx/allocator.c
@@ -44,20 +44,13 @@
 #include "libgomp.h"
 #include <stdlib.h>
 
+#define BASIC_ALLOC_PREFIX __nvptx_lowlat
+#include "../../basic-allocator.c"
+
 /* There should be some .shared space reserved for us.  There's no way to
    express this magic extern sizeless array in C so use asm.  */
 asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
 
-extern uint32_t __nvptx_lowlat_heap_root __attribute__((shared,nocommon));
-
-typedef union {
-  uint32_t raw;
-  struct {
-    uint16_t offset;
-    uint16_t size;
-  } desc;
-} heapdesc;
-
 static void *
 nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
 {
@@ -66,64 +59,7 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
       char *shared_pool;
       asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
 
-      /* Memory is allocated in 8-byte granularity.  */
-      size = (size + 7) & ~7;
-
-      /* Acquire a lock on the low-latency heap.  */
-      heapdesc root;
-      do
-	{
-	  root.raw = __atomic_exchange_n (&__nvptx_lowlat_heap_root,
-					  0xffffffff, MEMMODEL_ACQUIRE);
-	  if (root.raw != 0xffffffff)
-	    break;
-	  /* Spin.  */
-	}
-      while (1);
-
-      /* Walk the free chain.  */
-      heapdesc chunk = {root.raw};
-      uint32_t *prev_chunkptr = NULL;
-      uint32_t *chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
-      heapdesc onward_chain = {chunkptr[0]};
-      while (chunk.desc.size != 0 && (uint32_t)size > chunk.desc.size)
-	{
-	  chunk.raw = onward_chain.raw;
-	  prev_chunkptr = chunkptr;
-	  chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
-	  onward_chain.raw = chunkptr[0];
-	}
-
-      void *result = NULL;
-      if (chunk.desc.size != 0)
-	{
-	  /* Allocation successful.  */
-	  result = chunkptr;
-
-	  /* Update the free chain.  */
-	  heapdesc stillfree = {chunk.raw};
-	  stillfree.desc.offset += size;
-	  stillfree.desc.size -= size;
-	  uint32_t *stillfreeptr = (uint32_t*)(shared_pool
-					       + stillfree.desc.offset);
-
-	  if (stillfree.desc.size == 0)
-	    /* The whole chunk was used.  */
-	    stillfree.raw = onward_chain.raw;
-	  else
-	    /* The chunk was split, so restore the onward chain.  */
-	    stillfreeptr[0] = onward_chain.raw;
-
-	  /* The previous free slot or root now points to stillfree.  */
-	  if (prev_chunkptr)
-	    prev_chunkptr[0] = stillfree.raw;
-	  else
-	    root.raw = stillfree.raw;
-	}
-
-      /* Update the free chain root and release the lock.  */
-      __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
-
-      return result;
+      return __nvptx_lowlat_alloc (shared_pool, size);
     }
   else if (memspace == ompx_host_mem_space)
     return NULL;
@@ -136,16 +72,10 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
 {
   if (memspace == omp_low_lat_mem_space)
     {
-      /* Memory is allocated in 8-byte granularity.  */
-      size = (size + 7) & ~7;
-
-      uint64_t *result = nvptx_memspace_alloc (memspace, size);
-      if (result)
-	/* Inline memset in which we know size is a multiple of 8.  */
-	for (unsigned i = 0; i < (unsigned)size/8; i++)
-	  result[i] = 0;
+      char *shared_pool;
+      asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
 
-      return result;
+      return __nvptx_lowlat_calloc (shared_pool, size);
     }
   else if (memspace == ompx_host_mem_space)
     return NULL;
@@ -161,71 +91,7 @@ nvptx_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
     {
       char *shared_pool;
       asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
 
-      /* Memory is allocated in 8-byte granularity.  */
-      size = (size + 7) & ~7;
-
-      /* Acquire a lock on the low-latency heap.  */
-      heapdesc root;
-      do
-	{
-	  root.raw = __atomic_exchange_n (&__nvptx_lowlat_heap_root,
-					  0xffffffff, MEMMODEL_ACQUIRE);
-	  if (root.raw != 0xffffffff)
-	    break;
-	  /* Spin.  */
-	}
-      while (1);
-
-      /* Walk the free chain to find where to insert a new entry.  */
-      heapdesc chunk = {root.raw}, prev_chunk;
-      uint32_t *prev_chunkptr = NULL, *prevprev_chunkptr = NULL;
-      uint32_t *chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
-      heapdesc onward_chain = {chunkptr[0]};
-      while (chunk.desc.size != 0 && addr > (void*)chunkptr)
-	{
-	  prev_chunk.raw = chunk.raw;
-	  chunk.raw = onward_chain.raw;
-	  prevprev_chunkptr = prev_chunkptr;
-	  prev_chunkptr = chunkptr;
-	  chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
-	  onward_chain.raw = chunkptr[0];
-	}
-
-      /* Create the new chunk descriptor.  */
-      heapdesc newfreechunk;
-      newfreechunk.desc.offset = (uint16_t)((uintptr_t)addr
-					    - (uintptr_t)shared_pool);
-      newfreechunk.desc.size = (uint16_t)size;
-
-      /* Coalesce adjacent free chunks.  */
-      if (newfreechunk.desc.offset + size == chunk.desc.offset)
-	{
-	  /* Free chunk follows.  */
-	  newfreechunk.desc.size += chunk.desc.size;
-	  chunk.raw = onward_chain.raw;
-	}
-      if (prev_chunkptr)
-	{
-	  if (prev_chunk.desc.offset + prev_chunk.desc.size
-	      == newfreechunk.desc.offset)
-	    {
-	      /* Free chunk precedes.  */
-	      newfreechunk.desc.offset = prev_chunk.desc.offset;
-	      newfreechunk.desc.size += prev_chunk.desc.size;
-	      addr = shared_pool + prev_chunk.desc.offset;
-	      prev_chunkptr = prevprev_chunkptr;
-	    }
-	}
-
-      /* Update the free chain in the new and previous chunks.  */
-      ((uint32_t*)addr)[0] = chunk.raw;
-      if (prev_chunkptr)
-	prev_chunkptr[0] = newfreechunk.raw;
-      else
-	root.raw = newfreechunk.raw;
-
-      /* Update the free chain root and release the lock.  */
-      __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
+      __nvptx_lowlat_free (shared_pool, addr, size);
     }
   else
     free (addr);
@@ -240,123 +106,7 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
       char *shared_pool;
       asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
 
-      /* Memory is allocated in 8-byte granularity.  */
-      oldsize = (oldsize + 7) & ~7;
-      size = (size + 7) & ~7;
-
-      if (oldsize == size)
-	return addr;
-
-      /* Acquire a lock on the low-latency heap.  */
-      heapdesc root;
-      do
-	{
-	  root.raw = __atomic_exchange_n (&__nvptx_lowlat_heap_root,
-					  0xffffffff, MEMMODEL_ACQUIRE);
-	  if (root.raw != 0xffffffff)
-	    break;
-	  /* Spin.  */
-	}
-      while (1);
-
-      /* Walk the free chain.  */
-      heapdesc chunk = {root.raw};
-      uint32_t *prev_chunkptr = NULL;
-      uint32_t *chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
-      heapdesc onward_chain = {chunkptr[0]};
-      while (chunk.desc.size != 0 && (void*)chunkptr < addr)
-	{
-	  chunk.raw = onward_chain.raw;
-	  prev_chunkptr = chunkptr;
-	  chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
-	  onward_chain.raw = chunkptr[0];
-	}
-
-      void *result = NULL;
-      if (size < oldsize)
-	{
-	  /* The new allocation is smaller than the old; we can always
-	     shrink an allocation in place.  */
-	  result = addr;
-
-	  uint32_t *nowfreeptr = (uint32_t*)(addr + size);
-
-	  /* Update the free chain.  */
-	  heapdesc nowfree;
-	  nowfree.desc.offset = (char*)nowfreeptr - shared_pool;
-	  nowfree.desc.size = oldsize - size;
-
-	  if (nowfree.desc.offset + size == chunk.desc.offset)
-	    {
-	      /* Coalesce following free chunk.  */
-	      nowfree.desc.size += chunk.desc.size;
-	      nowfreeptr[0] = onward_chain.raw;
-	    }
-	  else
-	    nowfreeptr[0] = chunk.raw;
-
-	  /* The previous free slot or root now points to nowfree.  */
-	  if (prev_chunkptr)
-	    prev_chunkptr[0] = nowfree.raw;
-	  else
-	    root.raw = nowfree.raw;
-	}
-      else if (chunk.desc.size != 0
-	       && (char *)addr + oldsize == (char *)chunkptr
-	       && chunk.desc.size >= size-oldsize)
-	{
-	  /* The new allocation is larger than the old, and we found a
-	     large enough free block right after the existing block,
-	     so we extend into that space.  */
-	  result = addr;
-
-	  uint16_t delta = size-oldsize;
-
-	  /* Update the free chain.  */
-	  heapdesc stillfree = {chunk.raw};
-	  stillfree.desc.offset += delta;
-	  stillfree.desc.size -= delta;
-	  uint32_t *stillfreeptr = (uint32_t*)(shared_pool
-					       + stillfree.desc.offset);
-
-	  if (stillfree.desc.size == 0)
-	    /* The whole chunk was used.  */
-	    stillfree.raw = onward_chain.raw;
-	  else
-	    /* The chunk was split, so restore the onward chain.  */
-	    stillfreeptr[0] = onward_chain.raw;
-
-	  /* The previous free slot or root now points to stillfree.  */
-	  if (prev_chunkptr)
-	    prev_chunkptr[0] = stillfree.raw;
-	  else
-	    root.raw = stillfree.raw;
-	}
-      /* Else realloc in-place has failed and result remains NULL.  */
-
-      /* Update the free chain root and release the lock.  */
-      __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
-
-      if (result == NULL)
-	{
-	  /* The allocation could not be extended in place, so we simply
-	     allocate fresh memory and move the data.  If we can't allocate
-	     from low-latency memory then we leave the original alloaction
-	     intact and return NULL.
-	     We could do a fall-back to main memory, but we don't know what
-	     the fall-back trait said to do.  */
-	  result = nvptx_memspace_alloc (memspace, size);
-	  if (result != NULL)
-	    {
-	      /* Inline memcpy in which we know oldsize is a multiple of 8.  */
-	      uint64_t *from = addr, *to = result;
-	      for (unsigned i = 0; i < (unsigned)oldsize/8; i++)
-		to[i] = from[i];
-
-	      nvptx_memspace_free (memspace, addr, oldsize);
-	    }
-	}
-      return result;
+      return __nvptx_lowlat_realloc (shared_pool, addr, oldsize, size);
     }
   else if (memspace == ompx_host_mem_space)
     return NULL;

diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
index 685610e00be..b30b8df178d 100644
--- a/libgomp/config/nvptx/team.c
+++ b/libgomp/config/nvptx/team.c
@@ -33,7 +33,6 @@
 
 struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));
 int __gomp_team_num __attribute__((shared,nocommon));
-uint32_t __nvptx_lowlat_heap_root __attribute__((shared,nocommon));
 
 static void gomp_thread_start (struct gomp_thread_pool *);
 
@@ -41,6 +40,9 @@ static void gomp_thread_start (struct gomp_thread_pool *);
    express this magic extern sizeless array in C so use asm.  */
 asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
 
+/* Defined in basic-allocator.c via config/nvptx/allocator.c.  */
+void __nvptx_lowlat_init (void *heap, size_t size);
+
 /* This externally visible function handles target region entry.  It sets up
    a per-team thread pool and transfers control by calling FN (FN_DATA) in
    the master thread or gomp_thread_start in other threads.
@@ -76,19 +78,7 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data)
   asm ("mov.u32\t%0, %%dynamic_smem_size;\n" : "=r"(shared_pool_size));
 #endif
-
-  /* ... and initialize it with an empty free-chain.  */
-  union {
-    uint32_t raw;
-    struct {
-      uint16_t offset;
-      uint16_t size;
-    } desc;
-  } root;
-  root.desc.offset = 0;  /* The first byte is free.  */
-  root.desc.size = shared_pool_size;  /* The whole space is free.  */
-  __nvptx_lowlat_heap_root = root.raw;
-  shared_pool[0] = 0;  /* Terminate free chain.  */
+  __nvptx_lowlat_init (shared_pool, shared_pool_size);
 
   /* Initialize the thread pool.  */
   struct gomp_thread_pool *pool = alloca (sizeof (*pool));
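The commit message notes that the point of the move is to let AMD share the
code.  For orientation, a second instantiation could look roughly like the
sketch below; everything other than the BASIC_ALLOC_* macros and the
generated __gcn_lowlat_* names is a hypothetical assumption (the s_sleep
yield, the __gcn_lowlat prefix, and the gcn_lowlat_pool_base helper are
illustrative, not part of this commit):

    #include "libgomp.h"
    #include <stdlib.h>

    /* Optional hook: yield while spinning on the heap lock; the template
       defaults it to nothing.  */
    #define BASIC_ALLOC_YIELD asm ("s_sleep 1" ::: "memory")

    #define BASIC_ALLOC_PREFIX __gcn_lowlat
    #include "../../basic-allocator.c"

    /* Hypothetical helper returning the base of the low-latency (LDS)
       pool.  */
    extern char *gcn_lowlat_pool_base (void);

    static void *
    gcn_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
    {
      if (memspace == omp_low_lat_mem_space)
        /* Delegate to the template instance created above.  */
        return __gcn_lowlat_alloc (gcn_lowlat_pool_base (), size);
      return malloc (size);
    }

Note that BASIC_ALLOC_YIELD is only invoked in the allocation spin loop, so
a port that also needs polite spinning in free/realloc would have to extend
the template.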