From: Andrew Stubbs <ams@codesourcery.com>
To: <gcc-patches@gcc.gnu.org>
Subject: [PATCH 01/17] libgomp, nvptx: low-latency memory allocator
Date: Thu, 7 Jul 2022 11:34:32 +0100 [thread overview]
Message-ID: <400092d8ce44340cece0e2e38f88edbad6400b03.1657188329.git.ams@codesourcery.com> (raw)
In-Reply-To: <cover.1657188329.git.ams@codesourcery.com>
[-- Attachment #1: Type: text/plain, Size: 2765 bytes --]
This patch adds support for allocating low-latency ".shared" memory on
NVPTX GPU device, via the omp_low_lat_mem_space and omp_alloc. The memory
can be allocated, reallocated, and freed using a basic but fast algorithm,
is thread safe and the size of the low-latency heap can be configured using
the GOMP_NVPTX_LOWLAT_POOL environment variable.
The use of the PTX dynamic_smem_size feature means that low-latency allocator
will not work with the PTX 3.1 multilib.
libgomp/ChangeLog:
* allocator.c (MEMSPACE_ALLOC): New macro.
(MEMSPACE_CALLOC): New macro.
(MEMSPACE_REALLOC): New macro.
(MEMSPACE_FREE): New macro.
(dynamic_smem_size): New constants.
(omp_alloc): Use MEMSPACE_ALLOC.
Implement fall-backs for predefined allocators.
(omp_free): Use MEMSPACE_FREE.
(omp_calloc): Use MEMSPACE_CALLOC.
Implement fall-backs for predefined allocators.
(omp_realloc): Use MEMSPACE_REALLOC and MEMSPACE_ALLOC..
Implement fall-backs for predefined allocators.
* config/nvptx/team.c (__nvptx_lowlat_heap_root): New variable.
(__nvptx_lowlat_pool): New asm varaible.
(gomp_nvptx_main): Initialize the low-latency heap.
* plugin/plugin-nvptx.c (lowlat_pool_size): New variable.
(GOMP_OFFLOAD_init_device): Read the GOMP_NVPTX_LOWLAT_POOL envvar.
(GOMP_OFFLOAD_run): Apply lowlat_pool_size.
* config/nvptx/allocator.c: New file.
* testsuite/libgomp.c/allocators-1.c: New test.
* testsuite/libgomp.c/allocators-2.c: New test.
* testsuite/libgomp.c/allocators-3.c: New test.
* testsuite/libgomp.c/allocators-4.c: New test.
* testsuite/libgomp.c/allocators-5.c: New test.
* testsuite/libgomp.c/allocators-6.c: New test.
co-authored-by: Kwok Cheung Yeung <kcy@codesourcery.com>
---
libgomp/allocator.c | 235 ++++++++-----
libgomp/config/nvptx/allocator.c | 370 +++++++++++++++++++++
libgomp/config/nvptx/team.c | 28 ++
libgomp/plugin/plugin-nvptx.c | 23 +-
libgomp/testsuite/libgomp.c/allocators-1.c | 56 ++++
libgomp/testsuite/libgomp.c/allocators-2.c | 64 ++++
libgomp/testsuite/libgomp.c/allocators-3.c | 42 +++
libgomp/testsuite/libgomp.c/allocators-4.c | 196 +++++++++++
libgomp/testsuite/libgomp.c/allocators-5.c | 63 ++++
libgomp/testsuite/libgomp.c/allocators-6.c | 117 +++++++
10 files changed, 1110 insertions(+), 84 deletions(-)
create mode 100644 libgomp/config/nvptx/allocator.c
create mode 100644 libgomp/testsuite/libgomp.c/allocators-1.c
create mode 100644 libgomp/testsuite/libgomp.c/allocators-2.c
create mode 100644 libgomp/testsuite/libgomp.c/allocators-3.c
create mode 100644 libgomp/testsuite/libgomp.c/allocators-4.c
create mode 100644 libgomp/testsuite/libgomp.c/allocators-5.c
create mode 100644 libgomp/testsuite/libgomp.c/allocators-6.c
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-libgomp-nvptx-low-latency-memory-allocator.patch --]
[-- Type: text/x-patch; name="0001-libgomp-nvptx-low-latency-memory-allocator.patch", Size: 42973 bytes --]
diff --git a/libgomp/allocator.c b/libgomp/allocator.c
index b04820b8cf9..9b33bcf529b 100644
--- a/libgomp/allocator.c
+++ b/libgomp/allocator.c
@@ -37,6 +37,34 @@
#define omp_max_predefined_alloc omp_thread_mem_alloc
+/* These macros may be overridden in config/<target>/allocator.c. */
+#ifndef MEMSPACE_ALLOC
+#define MEMSPACE_ALLOC(MEMSPACE, SIZE) malloc (SIZE)
+#endif
+#ifndef MEMSPACE_CALLOC
+#define MEMSPACE_CALLOC(MEMSPACE, SIZE) calloc (1, SIZE)
+#endif
+#ifndef MEMSPACE_REALLOC
+#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) realloc (ADDR, SIZE)
+#endif
+#ifndef MEMSPACE_FREE
+#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) free (ADDR)
+#endif
+
+/* Map the predefined allocators to the correct memory space.
+ The index to this table is the omp_allocator_handle_t enum value. */
+static const omp_memspace_handle_t predefined_alloc_mapping[] = {
+ omp_default_mem_space, /* omp_null_allocator. */
+ omp_default_mem_space, /* omp_default_mem_alloc. */
+ omp_large_cap_mem_space, /* omp_large_cap_mem_alloc. */
+ omp_default_mem_space, /* omp_const_mem_alloc. */
+ omp_high_bw_mem_space, /* omp_high_bw_mem_alloc. */
+ omp_low_lat_mem_space, /* omp_low_lat_mem_alloc. */
+ omp_low_lat_mem_space, /* omp_cgroup_mem_alloc. */
+ omp_low_lat_mem_space, /* omp_pteam_mem_alloc. */
+ omp_low_lat_mem_space, /* omp_thread_mem_alloc. */
+};
+
enum gomp_memkind_kind
{
GOMP_MEMKIND_NONE = 0,
@@ -453,7 +481,7 @@ retry:
}
else
#endif
- ptr = malloc (new_size);
+ ptr = MEMSPACE_ALLOC (allocator_data->memspace, new_size);
if (ptr == NULL)
{
#ifdef HAVE_SYNC_BUILTINS
@@ -478,7 +506,13 @@ retry:
}
else
#endif
- ptr = malloc (new_size);
+ {
+ omp_memspace_handle_t memspace __attribute__((unused))
+ = (allocator_data
+ ? allocator_data->memspace
+ : predefined_alloc_mapping[allocator]);
+ ptr = MEMSPACE_ALLOC (memspace, new_size);
+ }
if (ptr == NULL)
goto fail;
}
@@ -496,35 +530,38 @@ retry:
return ret;
fail:
- if (allocator_data)
+ int fallback = (allocator_data
+ ? allocator_data->fallback
+ : allocator == omp_default_mem_alloc
+ ? omp_atv_null_fb
+ : omp_atv_default_mem_fb);
+ switch (fallback)
{
- switch (allocator_data->fallback)
- {
- case omp_atv_default_mem_fb:
- if ((new_alignment > sizeof (void *) && new_alignment > alignment)
+ case omp_atv_default_mem_fb:
+ if ((new_alignment > sizeof (void *) && new_alignment > alignment)
#ifdef LIBGOMP_USE_MEMKIND
- || memkind
+ || memkind
#endif
- || (allocator_data
- && allocator_data->pool_size < ~(uintptr_t) 0))
- {
- allocator = omp_default_mem_alloc;
- goto retry;
- }
- /* Otherwise, we've already performed default mem allocation
- and if that failed, it won't succeed again (unless it was
- intermittent. Return NULL then, as that is the fallback. */
- break;
- case omp_atv_null_fb:
- break;
- default:
- case omp_atv_abort_fb:
- gomp_fatal ("Out of memory allocating %lu bytes",
- (unsigned long) size);
- case omp_atv_allocator_fb:
- allocator = allocator_data->fb_data;
+ || (allocator_data
+ && allocator_data->pool_size < ~(uintptr_t) 0)
+ || !allocator_data)
+ {
+ allocator = omp_default_mem_alloc;
goto retry;
}
+ /* Otherwise, we've already performed default mem allocation
+ and if that failed, it won't succeed again (unless it was
+ intermittent. Return NULL then, as that is the fallback. */
+ break;
+ case omp_atv_null_fb:
+ break;
+ default:
+ case omp_atv_abort_fb:
+ gomp_fatal ("Out of memory allocating %lu bytes",
+ (unsigned long) size);
+ case omp_atv_allocator_fb:
+ allocator = allocator_data->fb_data;
+ goto retry;
}
return NULL;
}
@@ -557,6 +594,8 @@ void
omp_free (void *ptr, omp_allocator_handle_t allocator)
{
struct omp_mem_header *data;
+ omp_memspace_handle_t memspace __attribute__((unused))
+ = omp_default_mem_space;
if (ptr == NULL)
return;
@@ -586,10 +625,12 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
return;
}
#endif
+
+ memspace = allocator_data->memspace;
}
-#ifdef LIBGOMP_USE_MEMKIND
else
{
+#ifdef LIBGOMP_USE_MEMKIND
enum gomp_memkind_kind memkind = GOMP_MEMKIND_NONE;
if (data->allocator == omp_high_bw_mem_alloc)
memkind = GOMP_MEMKIND_HBW_PREFERRED;
@@ -605,9 +646,12 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
return;
}
}
- }
#endif
- free (data->ptr);
+
+ memspace = predefined_alloc_mapping[data->allocator];
+ }
+
+ MEMSPACE_FREE (memspace, data->ptr, data->size);
}
ialias (omp_free)
@@ -723,7 +767,7 @@ retry:
}
else
#endif
- ptr = calloc (1, new_size);
+ ptr = MEMSPACE_CALLOC (allocator_data->memspace, new_size);
if (ptr == NULL)
{
#ifdef HAVE_SYNC_BUILTINS
@@ -748,7 +792,13 @@ retry:
}
else
#endif
- ptr = calloc (1, new_size);
+ {
+ omp_memspace_handle_t memspace __attribute__((unused))
+ = (allocator_data
+ ? allocator_data->memspace
+ : predefined_alloc_mapping[allocator]);
+ ptr = MEMSPACE_CALLOC (memspace, new_size);
+ }
if (ptr == NULL)
goto fail;
}
@@ -766,35 +816,38 @@ retry:
return ret;
fail:
- if (allocator_data)
+ int fallback = (allocator_data
+ ? allocator_data->fallback
+ : allocator == omp_default_mem_alloc
+ ? omp_atv_null_fb
+ : omp_atv_default_mem_fb);
+ switch (fallback)
{
- switch (allocator_data->fallback)
- {
- case omp_atv_default_mem_fb:
- if ((new_alignment > sizeof (void *) && new_alignment > alignment)
+ case omp_atv_default_mem_fb:
+ if ((new_alignment > sizeof (void *) && new_alignment > alignment)
#ifdef LIBGOMP_USE_MEMKIND
- || memkind
+ || memkind
#endif
- || (allocator_data
- && allocator_data->pool_size < ~(uintptr_t) 0))
- {
- allocator = omp_default_mem_alloc;
- goto retry;
- }
- /* Otherwise, we've already performed default mem allocation
- and if that failed, it won't succeed again (unless it was
- intermittent. Return NULL then, as that is the fallback. */
- break;
- case omp_atv_null_fb:
- break;
- default:
- case omp_atv_abort_fb:
- gomp_fatal ("Out of memory allocating %lu bytes",
- (unsigned long) (size * nmemb));
- case omp_atv_allocator_fb:
- allocator = allocator_data->fb_data;
+ || (allocator_data
+ && allocator_data->pool_size < ~(uintptr_t) 0)
+ || !allocator_data)
+ {
+ allocator = omp_default_mem_alloc;
goto retry;
}
+ /* Otherwise, we've already performed default mem allocation
+ and if that failed, it won't succeed again (unless it was
+ intermittent. Return NULL then, as that is the fallback. */
+ break;
+ case omp_atv_null_fb:
+ break;
+ default:
+ case omp_atv_abort_fb:
+ gomp_fatal ("Out of memory allocating %lu bytes",
+ (unsigned long) (size * nmemb));
+ case omp_atv_allocator_fb:
+ allocator = allocator_data->fb_data;
+ goto retry;
}
return NULL;
}
@@ -967,9 +1020,10 @@ retry:
else
#endif
if (prev_size)
- new_ptr = realloc (data->ptr, new_size);
+ new_ptr = MEMSPACE_REALLOC (allocator_data->memspace, data->ptr,
+ data->size, new_size);
else
- new_ptr = malloc (new_size);
+ new_ptr = MEMSPACE_ALLOC (allocator_data->memspace, new_size);
if (new_ptr == NULL)
{
#ifdef HAVE_SYNC_BUILTINS
@@ -1010,7 +1064,13 @@ retry:
}
else
#endif
- new_ptr = realloc (data->ptr, new_size);
+ {
+ omp_memspace_handle_t memspace __attribute__((unused))
+ = (allocator_data
+ ? allocator_data->memspace
+ : predefined_alloc_mapping[allocator]);
+ new_ptr = MEMSPACE_REALLOC (memspace, data->ptr, data->size, new_size);
+ }
if (new_ptr == NULL)
goto fail;
ret = (char *) new_ptr + sizeof (struct omp_mem_header);
@@ -1030,7 +1090,13 @@ retry:
}
else
#endif
- new_ptr = malloc (new_size);
+ {
+ omp_memspace_handle_t memspace __attribute__((unused))
+ = (allocator_data
+ ? allocator_data->memspace
+ : predefined_alloc_mapping[allocator]);
+ new_ptr = MEMSPACE_ALLOC (memspace, new_size);
+ }
if (new_ptr == NULL)
goto fail;
}
@@ -1073,35 +1139,38 @@ retry:
return ret;
fail:
- if (allocator_data)
+ int fallback = (allocator_data
+ ? allocator_data->fallback
+ : allocator == omp_default_mem_alloc
+ ? omp_atv_null_fb
+ : omp_atv_default_mem_fb);
+ switch (fallback)
{
- switch (allocator_data->fallback)
- {
- case omp_atv_default_mem_fb:
- if (new_alignment > sizeof (void *)
+ case omp_atv_default_mem_fb:
+ if (new_alignment > sizeof (void *)
#ifdef LIBGOMP_USE_MEMKIND
- || memkind
+ || memkind
#endif
- || (allocator_data
- && allocator_data->pool_size < ~(uintptr_t) 0))
- {
- allocator = omp_default_mem_alloc;
- goto retry;
- }
- /* Otherwise, we've already performed default mem allocation
- and if that failed, it won't succeed again (unless it was
- intermittent. Return NULL then, as that is the fallback. */
- break;
- case omp_atv_null_fb:
- break;
- default:
- case omp_atv_abort_fb:
- gomp_fatal ("Out of memory allocating %lu bytes",
- (unsigned long) size);
- case omp_atv_allocator_fb:
- allocator = allocator_data->fb_data;
+ || (allocator_data
+ && allocator_data->pool_size < ~(uintptr_t) 0)
+ || !allocator_data)
+ {
+ allocator = omp_default_mem_alloc;
goto retry;
}
+ /* Otherwise, we've already performed default mem allocation
+ and if that failed, it won't succeed again (unless it was
+ intermittent. Return NULL then, as that is the fallback. */
+ break;
+ case omp_atv_null_fb:
+ break;
+ default:
+ case omp_atv_abort_fb:
+ gomp_fatal ("Out of memory allocating %lu bytes",
+ (unsigned long) size);
+ case omp_atv_allocator_fb:
+ allocator = allocator_data->fb_data;
+ goto retry;
}
return NULL;
}
diff --git a/libgomp/config/nvptx/allocator.c b/libgomp/config/nvptx/allocator.c
new file mode 100644
index 00000000000..6bc2ea48043
--- /dev/null
+++ b/libgomp/config/nvptx/allocator.c
@@ -0,0 +1,370 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The low-latency allocators use space reserved in .shared memory when the
+ kernel is launched. The heap is initialized in gomp_nvptx_main and all
+ allocations are forgotten when the kernel exits. Allocations to other
+ memory spaces all use the system malloc syscall.
+
+ The root heap descriptor is stored elsewhere in shared memory, and each
+ free chunk contains a similar descriptor for the next free chunk in the
+ chain.
+
+ The descriptor is two 16-bit values: offset and size, which describe the
+ location of a chunk of memory available for allocation. The offset is
+ relative to the base of the heap. The special value 0xffff, 0xffff
+ indicates that the heap is locked. The descriptor is encoded into a
+ single 32-bit integer so that it may be easily accessed atomically.
+
+ Memory is allocated to the first free chunk that fits. The free chain
+ is always stored in order of the offset to assist coalescing adjacent
+ chunks. */
+
+#include "libgomp.h"
+#include <stdlib.h>
+
+/* There should be some .shared space reserved for us. There's no way to
+ express this magic extern sizeless array in C so use asm. */
+asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
+
+extern uint32_t __nvptx_lowlat_heap_root __attribute__((shared,nocommon));
+
+typedef union {
+ uint32_t raw;
+ struct {
+ uint16_t offset;
+ uint16_t size;
+ } desc;
+} heapdesc;
+
+static void *
+nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
+{
+ if (memspace == omp_low_lat_mem_space)
+ {
+ char *shared_pool;
+ asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
+
+ /* Memory is allocated in 8-byte granularity. */
+ size = (size + 7) & ~7;
+
+ /* Acquire a lock on the low-latency heap. */
+ heapdesc root;
+ do
+ {
+ root.raw = __atomic_exchange_n (&__nvptx_lowlat_heap_root,
+ 0xffffffff, MEMMODEL_ACQUIRE);
+ if (root.raw != 0xffffffff)
+ break;
+ /* Spin. */
+ }
+ while (1);
+
+ /* Walk the free chain. */
+ heapdesc chunk = {root.raw};
+ uint32_t *prev_chunkptr = NULL;
+ uint32_t *chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
+ heapdesc onward_chain = {chunkptr[0]};
+ while (chunk.desc.size != 0 && (uint32_t)size > chunk.desc.size)
+ {
+ chunk.raw = onward_chain.raw;
+ prev_chunkptr = chunkptr;
+ chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
+ onward_chain.raw = chunkptr[0];
+ }
+
+ void *result = NULL;
+ if (chunk.desc.size != 0)
+ {
+ /* Allocation successful. */
+ result = chunkptr;
+
+ /* Update the free chain. */
+ heapdesc stillfree = {chunk.raw};
+ stillfree.desc.offset += size;
+ stillfree.desc.size -= size;
+ uint32_t *stillfreeptr = (uint32_t*)(shared_pool
+ + stillfree.desc.offset);
+
+ if (stillfree.desc.size == 0)
+ /* The whole chunk was used. */
+ stillfree.raw = onward_chain.raw;
+ else
+ /* The chunk was split, so restore the onward chain. */
+ stillfreeptr[0] = onward_chain.raw;
+
+ /* The previous free slot or root now points to stillfree. */
+ if (prev_chunkptr)
+ prev_chunkptr[0] = stillfree.raw;
+ else
+ root.raw = stillfree.raw;
+ }
+
+ /* Update the free chain root and release the lock. */
+ __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
+ return result;
+ }
+ else
+ return malloc (size);
+}
+
+static void *
+nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
+{
+ if (memspace == omp_low_lat_mem_space)
+ {
+ /* Memory is allocated in 8-byte granularity. */
+ size = (size + 7) & ~7;
+
+ uint64_t *result = nvptx_memspace_alloc (memspace, size);
+ if (result)
+ /* Inline memset in which we know size is a multiple of 8. */
+ for (unsigned i = 0; i < (unsigned)size/8; i++)
+ result[i] = 0;
+
+ return result;
+ }
+ else
+ return calloc (1, size);
+}
+
+static void
+nvptx_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
+{
+ if (memspace == omp_low_lat_mem_space)
+ {
+ char *shared_pool;
+ asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
+
+ /* Memory is allocated in 8-byte granularity. */
+ size = (size + 7) & ~7;
+
+ /* Acquire a lock on the low-latency heap. */
+ heapdesc root;
+ do
+ {
+ root.raw = __atomic_exchange_n (&__nvptx_lowlat_heap_root,
+ 0xffffffff, MEMMODEL_ACQUIRE);
+ if (root.raw != 0xffffffff)
+ break;
+ /* Spin. */
+ }
+ while (1);
+
+ /* Walk the free chain to find where to insert a new entry. */
+ heapdesc chunk = {root.raw}, prev_chunk;
+ uint32_t *prev_chunkptr = NULL, *prevprev_chunkptr = NULL;
+ uint32_t *chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
+ heapdesc onward_chain = {chunkptr[0]};
+ while (chunk.desc.size != 0 && addr > (void*)chunkptr)
+ {
+ prev_chunk.raw = chunk.raw;
+ chunk.raw = onward_chain.raw;
+ prevprev_chunkptr = prev_chunkptr;
+ prev_chunkptr = chunkptr;
+ chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
+ onward_chain.raw = chunkptr[0];
+ }
+
+ /* Create the new chunk descriptor. */
+ heapdesc newfreechunk;
+ newfreechunk.desc.offset = (uint16_t)((uintptr_t)addr
+ - (uintptr_t)shared_pool);
+ newfreechunk.desc.size = (uint16_t)size;
+
+ /* Coalesce adjacent free chunks. */
+ if (newfreechunk.desc.offset + size == chunk.desc.offset)
+ {
+ /* Free chunk follows. */
+ newfreechunk.desc.size += chunk.desc.size;
+ chunk.raw = onward_chain.raw;
+ }
+ if (prev_chunkptr)
+ {
+ if (prev_chunk.desc.offset + prev_chunk.desc.size
+ == newfreechunk.desc.offset)
+ {
+ /* Free chunk precedes. */
+ newfreechunk.desc.offset = prev_chunk.desc.offset;
+ newfreechunk.desc.size += prev_chunk.desc.size;
+ addr = shared_pool + prev_chunk.desc.offset;
+ prev_chunkptr = prevprev_chunkptr;
+ }
+ }
+
+ /* Update the free chain in the new and previous chunks. */
+ ((uint32_t*)addr)[0] = chunk.raw;
+ if (prev_chunkptr)
+ prev_chunkptr[0] = newfreechunk.raw;
+ else
+ root.raw = newfreechunk.raw;
+
+ /* Update the free chain root and release the lock. */
+ __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
+ }
+ else
+ free (addr);
+}
+
+static void *
+nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
+ size_t oldsize, size_t size)
+{
+ if (memspace == omp_low_lat_mem_space)
+ {
+ char *shared_pool;
+ asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
+
+ /* Memory is allocated in 8-byte granularity. */
+ oldsize = (oldsize + 7) & ~7;
+ size = (size + 7) & ~7;
+
+ if (oldsize == size)
+ return addr;
+
+ /* Acquire a lock on the low-latency heap. */
+ heapdesc root;
+ do
+ {
+ root.raw = __atomic_exchange_n (&__nvptx_lowlat_heap_root,
+ 0xffffffff, MEMMODEL_ACQUIRE);
+ if (root.raw != 0xffffffff)
+ break;
+ /* Spin. */
+ }
+ while (1);
+
+ /* Walk the free chain. */
+ heapdesc chunk = {root.raw};
+ uint32_t *prev_chunkptr = NULL;
+ uint32_t *chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
+ heapdesc onward_chain = {chunkptr[0]};
+ while (chunk.desc.size != 0 && (void*)chunkptr < addr)
+ {
+ chunk.raw = onward_chain.raw;
+ prev_chunkptr = chunkptr;
+ chunkptr = (uint32_t*)(shared_pool + chunk.desc.offset);
+ onward_chain.raw = chunkptr[0];
+ }
+
+ void *result = NULL;
+ if (size < oldsize)
+ {
+ /* The new allocation is smaller than the old; we can always
+ shrink an allocation in place. */
+ result = addr;
+
+ uint32_t *nowfreeptr = (uint32_t*)(addr + size);
+
+ /* Update the free chain. */
+ heapdesc nowfree;
+ nowfree.desc.offset = (char*)nowfreeptr - shared_pool;
+ nowfree.desc.size = oldsize - size;
+
+ if (nowfree.desc.offset + size == chunk.desc.offset)
+ {
+ /* Coalesce following free chunk. */
+ nowfree.desc.size += chunk.desc.size;
+ nowfreeptr[0] = onward_chain.raw;
+ }
+ else
+ nowfreeptr[0] = chunk.raw;
+
+ /* The previous free slot or root now points to nowfree. */
+ if (prev_chunkptr)
+ prev_chunkptr[0] = nowfree.raw;
+ else
+ root.raw = nowfree.raw;
+ }
+ else if (chunk.desc.size != 0
+ && (char *)addr + oldsize == (char *)chunkptr
+ && chunk.desc.size >= size-oldsize)
+ {
+ /* The new allocation is larger than the old, and we found a
+ large enough free block right after the existing block,
+ so we extend into that space. */
+ result = addr;
+
+ uint16_t delta = size-oldsize;
+
+ /* Update the free chain. */
+ heapdesc stillfree = {chunk.raw};
+ stillfree.desc.offset += delta;
+ stillfree.desc.size -= delta;
+ uint32_t *stillfreeptr = (uint32_t*)(shared_pool
+ + stillfree.desc.offset);
+
+ if (stillfree.desc.size == 0)
+ /* The whole chunk was used. */
+ stillfree.raw = onward_chain.raw;
+ else
+ /* The chunk was split, so restore the onward chain. */
+ stillfreeptr[0] = onward_chain.raw;
+
+ /* The previous free slot or root now points to stillfree. */
+ if (prev_chunkptr)
+ prev_chunkptr[0] = stillfree.raw;
+ else
+ root.raw = stillfree.raw;
+ }
+ /* Else realloc in-place has failed and result remains NULL. */
+
+ /* Update the free chain root and release the lock. */
+ __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
+
+ if (result == NULL)
+ {
+ /* The allocation could not be extended in place, so we simply
+ allocate fresh memory and move the data. If we can't allocate
+ from low-latency memory then we leave the original alloaction
+ intact and return NULL.
+ We could do a fall-back to main memory, but we don't know what
+ the fall-back trait said to do. */
+ result = nvptx_memspace_alloc (memspace, size);
+ if (result != NULL)
+ {
+ /* Inline memcpy in which we know oldsize is a multiple of 8. */
+ uint64_t *from = addr, *to = result;
+ for (unsigned i = 0; i < (unsigned)oldsize/8; i++)
+ to[i] = from[i];
+
+ nvptx_memspace_free (memspace, addr, oldsize);
+ }
+ }
+ return result;
+ }
+ else
+ return realloc (addr, size);
+}
+
+#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
+ nvptx_memspace_alloc (MEMSPACE, SIZE)
+#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
+ nvptx_memspace_calloc (MEMSPACE, SIZE)
+#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \
+ nvptx_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
+#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
+ nvptx_memspace_free (MEMSPACE, ADDR, SIZE)
+
+#include "../../allocator.c"
diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
index 6923416fb4e..65a7af3417b 100644
--- a/libgomp/config/nvptx/team.c
+++ b/libgomp/config/nvptx/team.c
@@ -33,9 +33,13 @@
struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));
int __gomp_team_num __attribute__((shared,nocommon));
+uint32_t __nvptx_lowlat_heap_root __attribute__((shared,nocommon));
static void gomp_thread_start (struct gomp_thread_pool *);
+/* There should be some .shared space reserved for us. There's no way to
+ express this magic extern sizeless array in C so use asm. */
+asm (".extern .shared .u8 __nvptx_lowlat_pool[];\n");
/* This externally visible function handles target region entry. It
sets up a per-team thread pool and transfers control by calling FN (FN_DATA)
@@ -63,6 +67,30 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data)
nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
+ /* Find the low-latency heap details .... */
+ uint32_t *shared_pool;
+ uint32_t shared_pool_size = 0;
+ asm ("cvta.shared.u64\t%0, __nvptx_lowlat_pool;" : "=r"(shared_pool));
+#if __PTX_ISA_VERSION_MAJOR__ > 4 \
+ || (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MAJOR__ >= 1)
+ asm ("mov.u32\t%0, %%dynamic_smem_size;\n"
+ : "=r"(shared_pool_size));
+#endif
+
+ /* ... and initialize it with an empty free-chain. */
+ union {
+ uint32_t raw;
+ struct {
+ uint16_t offset;
+ uint16_t size;
+ } desc;
+ } root;
+ root.desc.offset = 0; /* The first byte is free. */
+ root.desc.size = shared_pool_size; /* The whole space is free. */
+ shared_pool[0] = 0; /* Terminate free chain. */
+ __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
+
+ /* Initialize the thread pool. */
struct gomp_thread_pool *pool = alloca (sizeof (*pool));
pool->threads = alloca (ntids * sizeof (*pool->threads));
for (tid = 0; tid < ntids; tid++)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index bc63e274cdf..40739ba592d 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -334,6 +334,11 @@ struct ptx_device
static struct ptx_device **ptx_devices;
+/* OpenMP kernels reserve a small amount of ".shared" space for use by
+ omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
+ default is set here. */
+static unsigned lowlat_pool_size = 8*1024;
+
static inline struct nvptx_thread *
nvptx_thread (void)
{
@@ -1205,6 +1210,22 @@ GOMP_OFFLOAD_init_device (int n)
instantiated_devices++;
}
+ const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
+ const char *env_var = secure_getenv (var_name);
+ notify_var (var_name, env_var);
+
+ if (env_var != NULL)
+ {
+ char *endptr;
+ unsigned long val = strtoul (env_var, &endptr, 10);
+ if (endptr == NULL || *endptr != '\0'
+ || errno == ERANGE || errno == EINVAL
+ || val > UINT_MAX)
+ GOMP_PLUGIN_error ("Error parsing %s", var_name);
+ else
+ lowlat_pool_size = val;
+ }
+
pthread_mutex_unlock (&ptx_dev_lock);
return dev != NULL;
@@ -2030,7 +2051,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
" [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
__FUNCTION__, fn_name, teams, threads);
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
- 32, threads, 1, 0, NULL, NULL, config);
+ 32, threads, 1, lowlat_pool_size, NULL, NULL, config);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
diff --git a/libgomp/testsuite/libgomp.c/allocators-1.c b/libgomp/testsuite/libgomp.c/allocators-1.c
new file mode 100644
index 00000000000..04968e4c83d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/allocators-1.c
@@ -0,0 +1,56 @@
+/* { dg-do run } */
+
+/* Test that omp_alloc returns usable memory. */
+
+#include <omp.h>
+
+#pragma omp requires dynamic_allocators
+
+void
+test (int n, omp_allocator_handle_t allocator)
+{
+ #pragma omp target map(to:n) map(to:allocator)
+ {
+ int *a;
+ a = (int *) omp_alloc(n*sizeof(int), allocator);
+
+ #pragma omp parallel
+ for (int i = 0; i < n; i++)
+ a[i] = i;
+
+ for (int i = 0; i < n; i++)
+ if (a[i] != i)
+ {
+ __builtin_printf ("data mismatch at %i\n", i);
+ __builtin_abort ();
+ }
+
+ omp_free(a, allocator);
+ }
+}
+
+int
+main ()
+{
+ // Smaller than low-latency memory limit
+ test (10, omp_default_mem_alloc);
+ test (10, omp_large_cap_mem_alloc);
+ test (10, omp_const_mem_alloc);
+ test (10, omp_high_bw_mem_alloc);
+ test (10, omp_low_lat_mem_alloc);
+ test (10, omp_cgroup_mem_alloc);
+ test (10, omp_pteam_mem_alloc);
+ test (10, omp_thread_mem_alloc);
+
+ // Larger than low-latency memory limit
+ test (100000, omp_default_mem_alloc);
+ test (100000, omp_large_cap_mem_alloc);
+ test (100000, omp_const_mem_alloc);
+ test (100000, omp_high_bw_mem_alloc);
+ test (100000, omp_low_lat_mem_alloc);
+ test (100000, omp_cgroup_mem_alloc);
+ test (100000, omp_pteam_mem_alloc);
+ test (100000, omp_thread_mem_alloc);
+
+ return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/allocators-2.c b/libgomp/testsuite/libgomp.c/allocators-2.c
new file mode 100644
index 00000000000..a98f1b4c05e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/allocators-2.c
@@ -0,0 +1,64 @@
+/* { dg-do run } */
+
+/* Test concurrent and repeated allocations. */
+
+#include <omp.h>
+
+#pragma omp requires dynamic_allocators
+
+void
+test (int n, omp_allocator_handle_t allocator)
+{
+ #pragma omp target map(to:n) map(to:allocator)
+ {
+ int **a;
+ a = (int **) omp_alloc(n*sizeof(int*), allocator);
+
+ #pragma omp parallel for
+ for (int i = 0; i < n; i++)
+ {
+ /*Use 10x to ensure we do activate low-latency fall-back. */
+ a[i] = omp_alloc(sizeof(int)*10, allocator);
+ a[i][0] = i;
+ }
+
+ for (int i = 0; i < n; i++)
+ if (a[i][0] != i)
+ {
+ __builtin_printf ("data mismatch at %i\n", i);
+ __builtin_abort ();
+ }
+
+ #pragma omp parallel for
+ for (int i = 0; i < n; i++)
+ omp_free(a[i], allocator);
+
+ omp_free (a, allocator);
+ }
+}
+
+int
+main ()
+{
+ // Smaller than low-latency memory limit
+ test (10, omp_default_mem_alloc);
+ test (10, omp_large_cap_mem_alloc);
+ test (10, omp_const_mem_alloc);
+ test (10, omp_high_bw_mem_alloc);
+ test (10, omp_low_lat_mem_alloc);
+ test (10, omp_cgroup_mem_alloc);
+ test (10, omp_pteam_mem_alloc);
+ test (10, omp_thread_mem_alloc);
+
+ // Larger than low-latency memory limit (on aggregate)
+ test (1000, omp_default_mem_alloc);
+ test (1000, omp_large_cap_mem_alloc);
+ test (1000, omp_const_mem_alloc);
+ test (1000, omp_high_bw_mem_alloc);
+ test (1000, omp_low_lat_mem_alloc);
+ test (1000, omp_cgroup_mem_alloc);
+ test (1000, omp_pteam_mem_alloc);
+ test (1000, omp_thread_mem_alloc);
+
+ return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/allocators-3.c b/libgomp/testsuite/libgomp.c/allocators-3.c
new file mode 100644
index 00000000000..45514c2a088
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/allocators-3.c
@@ -0,0 +1,42 @@
+/* { dg-do run } */
+
+/* Stress-test omp_alloc/omp_malloc under concurrency. */
+
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#pragma omp requires dynamic_allocators
+
+#define N 1000
+
+void
+test (omp_allocator_handle_t allocator)
+{
+ #pragma omp target map(to:allocator)
+ {
+ #pragma omp parallel for
+ for (int i = 0; i < N; i++)
+ for (int j = 0; j < N; j++)
+ {
+ int *p = omp_alloc(sizeof(int), allocator);
+ omp_free(p, allocator);
+ }
+ }
+}
+
+int
+main ()
+{
+ // Smaller than low-latency memory limit
+ test (omp_default_mem_alloc);
+ test (omp_large_cap_mem_alloc);
+ test (omp_const_mem_alloc);
+ test (omp_high_bw_mem_alloc);
+ test (omp_low_lat_mem_alloc);
+ test (omp_cgroup_mem_alloc);
+ test (omp_pteam_mem_alloc);
+ test (omp_thread_mem_alloc);
+
+ return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/allocators-4.c b/libgomp/testsuite/libgomp.c/allocators-4.c
new file mode 100644
index 00000000000..9fa6aa1624f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/allocators-4.c
@@ -0,0 +1,196 @@
+/* { dg-do run } */
+
+/* Test that low-latency free chains are sound. */
+
+#include <stddef.h>
+#include <omp.h>
+
+#pragma omp requires dynamic_allocators
+
+void
+check (int cond, const char *msg)
+{
+ if (!cond)
+ {
+ __builtin_printf ("%s\n", msg);
+ __builtin_abort ();
+ }
+}
+
+int
+main ()
+{
+ #pragma omp target
+ {
+ /* Ensure that the memory we get *is* low-latency with a null-fallback. */
+ omp_alloctrait_t traits[1]
+ = { { omp_atk_fallback, omp_atv_null_fb } };
+ omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
+ 1, traits);
+
+ int size = 4;
+
+ char *a = omp_alloc(size, lowlat);
+ char *b = omp_alloc(size, lowlat);
+ char *c = omp_alloc(size, lowlat);
+ char *d = omp_alloc(size, lowlat);
+
+ /* There are headers and padding to account for. */
+ int size2 = size + (b-a);
+ int size3 = size + (c-a);
+ int size4 = size + (d-a) + 100; /* Random larger amount. */
+
+ check (a != NULL && b != NULL && c != NULL && d != NULL,
+ "omp_alloc returned NULL\n");
+
+ omp_free(a, lowlat);
+ char *p = omp_alloc (size, lowlat);
+ check (p == a, "allocate did not reuse first chunk");
+
+ omp_free(b, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not reuse second chunk");
+
+ omp_free(c, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == c, "allocate did not reuse third chunk");
+
+ omp_free(a, lowlat);
+ omp_free(b, lowlat);
+ p = omp_alloc (size2, lowlat);
+ check (p == a, "allocate did not coalesce first two chunks");
+
+ omp_free(p, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == a, "allocate did not split first chunk (1)");
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not split first chunk (2)");
+
+ omp_free(b, lowlat);
+ omp_free(c, lowlat);
+ p = omp_alloc (size2, lowlat);
+ check (p == b, "allocate did not coalesce middle two chunks");
+
+ omp_free(p, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not split second chunk (1)");
+ p = omp_alloc (size, lowlat);
+ check (p == c, "allocate did not split second chunk (2)");
+
+ omp_free(b, lowlat);
+ omp_free(a, lowlat);
+ p = omp_alloc (size2, lowlat);
+ check (p == a, "allocate did not coalesce first two chunks, reverse free");
+
+ omp_free(p, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == a, "allocate did not split first chunk (1), reverse free");
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not split first chunk (2), reverse free");
+
+ omp_free(c, lowlat);
+ omp_free(b, lowlat);
+ p = omp_alloc (size2, lowlat);
+ check (p == b, "allocate did not coalesce second two chunks, reverse free");
+
+ omp_free(p, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not split second chunk (1), reverse free");
+ p = omp_alloc (size, lowlat);
+ check (p == c, "allocate did not split second chunk (2), reverse free");
+
+ omp_free(a, lowlat);
+ omp_free(b, lowlat);
+ omp_free(c, lowlat);
+ p = omp_alloc (size3, lowlat);
+ check (p == a, "allocate did not coalesce first three chunks");
+
+ omp_free(p, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == a, "allocate did not split first chunk (1)");
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not split first chunk (2)");
+ p = omp_alloc (size, lowlat);
+ check (p == c, "allocate did not split first chunk (3)");
+
+ omp_free(b, lowlat);
+ omp_free(c, lowlat);
+ omp_free(d, lowlat);
+ p = omp_alloc (size3, lowlat);
+ check (p == b, "allocate did not coalesce last three chunks");
+
+ omp_free(p, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not split second chunk (1)");
+ p = omp_alloc (size, lowlat);
+ check (p == c, "allocate did not split second chunk (2)");
+ p = omp_alloc (size, lowlat);
+ check (p == d, "allocate did not split second chunk (3)");
+
+ omp_free(c, lowlat);
+ omp_free(b, lowlat);
+ omp_free(a, lowlat);
+ p = omp_alloc (size3, lowlat);
+ check (p == a, "allocate did not coalesce first three chunks, reverse free");
+
+ omp_free(p, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == a, "allocate did not split first chunk (1), reverse free");
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not split first chunk (2), reverse free");
+ p = omp_alloc (size, lowlat);
+ check (p == c, "allocate did not split first chunk (3), reverse free");
+
+ omp_free(d, lowlat);
+ omp_free(c, lowlat);
+ omp_free(b, lowlat);
+ p = omp_alloc (size3, lowlat);
+ check (p == b, "allocate did not coalesce second three chunks, reverse free");
+
+ omp_free(p, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not split second chunk (1), reverse free");
+ p = omp_alloc (size, lowlat);
+ check (p == c, "allocate did not split second chunk (2), reverse free");
+ p = omp_alloc (size, lowlat);
+ check (p == d, "allocate did not split second chunk (3), reverse free");
+
+ omp_free(c, lowlat);
+ omp_free(a, lowlat);
+ omp_free(b, lowlat);
+ p = omp_alloc (size3, lowlat);
+ check (p == a, "allocate did not coalesce first three chunks, mixed free");
+
+ omp_free(p, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == a, "allocate did not split first chunk (1), mixed free");
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not split first chunk (2), mixed free");
+ p = omp_alloc (size, lowlat);
+ check (p == c, "allocate did not split first chunk (3), mixed free");
+
+ omp_free(d, lowlat);
+ omp_free(b, lowlat);
+ omp_free(c, lowlat);
+ p = omp_alloc (size3, lowlat);
+ check (p == b, "allocate did not coalesce second three chunks, mixed free");
+
+ omp_free(p, lowlat);
+ p = omp_alloc (size, lowlat);
+ check (p == b, "allocate did not split second chunk (1), mixed free");
+ p = omp_alloc (size, lowlat);
+ check (p == c, "allocate did not split second chunk (2), mixed free");
+ p = omp_alloc (size, lowlat);
+ check (p == d, "allocate did not split second chunk (3), mixed free");
+
+ omp_free(a, lowlat);
+ omp_free(b, lowlat);
+ omp_free(c, lowlat);
+ omp_free(d, lowlat);
+ p = omp_alloc(size4, lowlat);
+ check (p == a, "allocate did not coalesce all memory");
+ }
+
+return 0;
+}
+
diff --git a/libgomp/testsuite/libgomp.c/allocators-5.c b/libgomp/testsuite/libgomp.c/allocators-5.c
new file mode 100644
index 00000000000..9694010cf1f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/allocators-5.c
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+
+/* Test calloc with omp_alloc. */
+
+#include <omp.h>
+
+#pragma omp requires dynamic_allocators
+
+void
+test (int n, omp_allocator_handle_t allocator)
+{
+ #pragma omp target map(to:n) map(to:allocator)
+ {
+ int *a;
+ a = (int *) omp_calloc(n, sizeof(int), allocator);
+
+ for (int i = 0; i < n; i++)
+ if (a[i] != 0)
+ {
+ __builtin_printf ("memory not zeroed at %i\n", i);
+ __builtin_abort ();
+ }
+
+ #pragma omp parallel
+ for (int i = 0; i < n; i++)
+ a[i] = i;
+
+ for (int i = 0; i < n; i++)
+ if (a[i] != i)
+ {
+ __builtin_printf ("data mismatch at %i\n", i);
+ __builtin_abort ();
+ }
+
+ omp_free(a, allocator);
+ }
+}
+
+int
+main ()
+{
+ // Smaller than low-latency memory limit
+ test (10, omp_default_mem_alloc);
+ test (10, omp_large_cap_mem_alloc);
+ test (10, omp_const_mem_alloc);
+ test (10, omp_high_bw_mem_alloc);
+ test (10, omp_low_lat_mem_alloc);
+ test (10, omp_cgroup_mem_alloc);
+ test (10, omp_pteam_mem_alloc);
+ test (10, omp_thread_mem_alloc);
+
+ // Larger than low-latency memory limit
+ test (100000, omp_default_mem_alloc);
+ test (100000, omp_large_cap_mem_alloc);
+ test (100000, omp_const_mem_alloc);
+ test (100000, omp_high_bw_mem_alloc);
+ test (100000, omp_low_lat_mem_alloc);
+ test (100000, omp_cgroup_mem_alloc);
+ test (100000, omp_pteam_mem_alloc);
+ test (100000, omp_thread_mem_alloc);
+
+ return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/allocators-6.c b/libgomp/testsuite/libgomp.c/allocators-6.c
new file mode 100644
index 00000000000..90bf73095ef
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/allocators-6.c
@@ -0,0 +1,117 @@
+/* { dg-do run } */
+
+/* Test that low-latency realloc and free chains are sound. */
+
+#include <stddef.h>
+#include <omp.h>
+
+#pragma omp requires dynamic_allocators
+
+void
+check (int cond, const char *msg)
+{
+ if (!cond)
+ {
+ __builtin_printf ("%s\n", msg);
+ __builtin_abort ();
+ }
+}
+
+int
+main ()
+{
+ #pragma omp target
+ {
+ /* Ensure that the memory we get *is* low-latency with a null-fallback. */
+ omp_alloctrait_t traits[1]
+ = { { omp_atk_fallback, omp_atv_null_fb } };
+ omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
+ 1, traits);
+
+ int size = 16;
+
+ char *a = (char *)omp_alloc(size, lowlat);
+ char *b = (char *)omp_alloc(size, lowlat);
+ char *c = (char *)omp_alloc(size, lowlat);
+ char *d = (char *)omp_alloc(size, lowlat);
+
+ /* There are headers and padding to account for. */
+ int size2 = size + (b-a);
+ int size3 = size + (c-a);
+ int size4 = size + (d-a) + 100; /* Random larger amount. */
+
+ check (a != NULL && b != NULL && c != NULL && d != NULL,
+ "omp_alloc returned NULL\n");
+
+ char *p = omp_realloc (b, size, lowlat, lowlat);
+ check (p == b, "realloc did not reuse same size chunk, no space after");
+
+ p = omp_realloc (b, size-8, lowlat, lowlat);
+ check (p == b, "realloc did not reuse smaller chunk, no space after");
+
+ p = omp_realloc (b, size, lowlat, lowlat);
+ check (p == b, "realloc did not reuse original size chunk, no space after");
+
+ /* Make space after b. */
+ omp_free(c, lowlat);
+
+ p = omp_realloc (b, size, lowlat, lowlat);
+ check (p == b, "realloc did not reuse same size chunk");
+
+ p = omp_realloc (b, size-8, lowlat, lowlat);
+ check (p == b, "realloc did not reuse smaller chunk");
+
+ p = omp_realloc (b, size, lowlat, lowlat);
+ check (p == b, "realloc did not reuse original size chunk");
+
+ p = omp_realloc (b, size+8, lowlat, lowlat);
+ check (p == b, "realloc did not extend in place by a little");
+
+ p = omp_realloc (b, size2, lowlat, lowlat);
+ check (p == b, "realloc did not extend into whole next chunk");
+
+ p = omp_realloc (b, size3, lowlat, lowlat);
+ check (p != b, "realloc did not move b elsewhere");
+ omp_free (p, lowlat);
+
+
+ p = omp_realloc (a, size, lowlat, lowlat);
+ check (p == a, "realloc did not reuse same size chunk, first position");
+
+ p = omp_realloc (a, size-8, lowlat, lowlat);
+ check (p == a, "realloc did not reuse smaller chunk, first position");
+
+ p = omp_realloc (a, size, lowlat, lowlat);
+ check (p == a, "realloc did not reuse original size chunk, first position");
+
+ p = omp_realloc (a, size+8, lowlat, lowlat);
+ check (p == a, "realloc did not extend in place by a little, first position");
+
+ p = omp_realloc (a, size3, lowlat, lowlat);
+ check (p == a, "realloc did not extend into whole next chunk, first position");
+
+ p = omp_realloc (a, size4, lowlat, lowlat);
+ check (p != a, "realloc did not move a elsewhere, first position");
+ omp_free (p, lowlat);
+
+
+ p = omp_realloc (d, size, lowlat, lowlat);
+ check (p == d, "realloc did not reuse same size chunk, last position");
+
+ p = omp_realloc (d, size-8, lowlat, lowlat);
+ check (p == d, "realloc did not reuse smaller chunk, last position");
+
+ p = omp_realloc (d, size, lowlat, lowlat);
+ check (p == d, "realloc did not reuse original size chunk, last position");
+
+ p = omp_realloc (d, size+8, lowlat, lowlat);
+ check (p == d, "realloc did not extend in place by d little, last position");
+
+ /* Larger than low latency memory. */
+ p = omp_realloc(d, 100000000, lowlat, lowlat);
+ check (p == NULL, "realloc did not fail on OOM");
+ }
+
+return 0;
+}
+
next prev parent reply other threads:[~2022-07-07 10:35 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-07-07 10:34 [PATCH 00/17] openmp, nvptx, amdgcn: 5.0 Memory Allocators Andrew Stubbs
2022-07-07 10:34 ` Andrew Stubbs [this message]
2022-12-08 11:40 ` [PATCH 01/17] libgomp, nvptx: low-latency memory allocator Jakub Jelinek
2022-07-07 10:34 ` [PATCH 02/17] libgomp: pinned memory Andrew Stubbs
2022-12-08 12:11 ` Jakub Jelinek
2022-12-08 12:51 ` Andrew Stubbs
2022-12-08 14:02 ` Tobias Burnus
2022-12-08 14:35 ` Andrew Stubbs
2022-12-08 15:02 ` Tobias Burnus
2022-07-07 10:34 ` [PATCH 03/17] libgomp, openmp: Add ompx_pinned_mem_alloc Andrew Stubbs
2022-07-07 10:34 ` [PATCH 04/17] openmp, nvptx: low-lat memory access traits Andrew Stubbs
2022-07-07 10:34 ` [PATCH 05/17] openmp, nvptx: ompx_unified_shared_mem_alloc Andrew Stubbs
2022-07-07 10:34 ` [PATCH 06/17] openmp: Add -foffload-memory Andrew Stubbs
2022-07-07 10:34 ` [PATCH 07/17] openmp: allow requires unified_shared_memory Andrew Stubbs
2022-07-07 10:34 ` [PATCH 08/17] openmp: -foffload-memory=pinned Andrew Stubbs
2022-07-07 11:54 ` Tobias Burnus
2022-07-07 22:18 ` Andrew Stubbs
2022-07-08 9:00 ` Tobias Burnus
2022-07-08 9:55 ` Andrew Stubbs
2022-07-08 9:57 ` Tobias Burnus
2023-02-20 14:59 ` Prototype 'GOMP_enable_pinned_mode' (was: [PATCH 08/17] openmp: -foffload-memory=pinned) Thomas Schwinge
2022-07-07 10:34 ` [PATCH 09/17] openmp: Use libgomp memory allocation functions with unified shared memory Andrew Stubbs
2022-07-07 10:34 ` [PATCH 10/17] Add parsing support for allocate directive (OpenMP 5.0) Andrew Stubbs
2022-07-07 10:34 ` [PATCH 11/17] Translate " Andrew Stubbs
2022-07-07 10:34 ` [PATCH 12/17] Handle cleanup of omp allocated variables " Andrew Stubbs
2022-07-07 10:34 ` [PATCH 13/17] Gimplify allocate directive " Andrew Stubbs
2022-07-07 10:34 ` [PATCH 14/17] Lower " Andrew Stubbs
2022-07-07 10:34 ` [PATCH 15/17] amdgcn: Support XNACK mode Andrew Stubbs
2022-07-07 10:34 ` [PATCH 16/17] amdgcn, openmp: Auto-detect USM mode and set HSA_XNACK Andrew Stubbs
2022-07-07 10:34 ` [PATCH 17/17] amdgcn: libgomp plugin USM implementation Andrew Stubbs
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=400092d8ce44340cece0e2e38f88edbad6400b03.1657188329.git.ams@codesourcery.com \
--to=ams@codesourcery.com \
--cc=gcc-patches@gcc.gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).