diff --git a/gcc/config/gcn/gcn-builtins.def b/gcc/config/gcn/gcn-builtins.def index 636a8e7a1a9..471457d7c23 100644 --- a/gcc/config/gcn/gcn-builtins.def +++ b/gcc/config/gcn/gcn-builtins.def @@ -164,6 +164,8 @@ DEF_BUILTIN (FIRST_CALL_THIS_THREAD_P, -1, "first_call_this_thread_p", B_INSN, _A1 (GCN_BTI_BOOL), gcn_expand_builtin_1) DEF_BUILTIN (KERNARG_PTR, -1, "kernarg_ptr", B_INSN, _A1 (GCN_BTI_VOIDPTR), gcn_expand_builtin_1) +DEF_BUILTIN (DISPATCH_PTR, -1, "dispatch_ptr", B_INSN, _A1 (GCN_BTI_VOIDPTR), + gcn_expand_builtin_1) DEF_BUILTIN (GET_STACK_LIMIT, -1, "get_stack_limit", B_INSN, _A1 (GCN_BTI_VOIDPTR), gcn_expand_builtin_1) diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 22d2b6ebf6d..d70238820dd 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -110,7 +110,8 @@ gcn_init_machine_status (void) f = ggc_cleared_alloc (); - if (TARGET_GCN3) + // FIXME: re-enable global addressing with safety for LDS-flat addresses + //if (TARGET_GCN3) f->use_flat_addressing = true; return f; @@ -4881,6 +4882,19 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ , } return ptr; } + case GCN_BUILTIN_DISPATCH_PTR: + { + rtx ptr; + if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0) + ptr = gen_rtx_REG (DImode, + cfun->machine->args.reg[DISPATCH_PTR_ARG]); + else + { + ptr = gen_reg_rtx (DImode); + emit_move_insn (ptr, const0_rtx); + } + return ptr; + } case GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P: { /* Stash a marker in the unused upper 16 bits of s[0:1] to indicate diff --git a/libgomp/config/gcn/allocator.c b/libgomp/config/gcn/allocator.c new file mode 100644 index 00000000000..e9a95d683f9 --- /dev/null +++ b/libgomp/config/gcn/allocator.c @@ -0,0 +1,127 @@ +/* Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). 
 + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* The low-latency allocators use space reserved in LDS memory when the + kernel is launched. The heap is initialized in gomp_gcn_enter_kernel and + all allocations are forgotten when the kernel exits. Allocations to other + memory spaces all use the system malloc syscall. + + The pointers returned are 64-bit "Flat" addresses indistinguishable from + regular pointers, but only compatible with the "flat_load/store" + instructions. The compiler has been coded to assign default address + spaces accordingly. + + LDS memory is not visible to other teams, and therefore may only be used + when the memspace access trait is set accordingly. */ + +#include "libgomp.h" +#include <stdlib.h> + +#define BASIC_ALLOC_PREFIX __gcn_lowlat +#define BASIC_ALLOC_YIELD asm ("s_sleep 1" ::: "memory") +#include "../../basic-allocator.c" + +/* The low-latency heap is located in LDS memory, but we need the __flat + address space for compatibility reasons. 
*/ +#define FLAT_HEAP_PTR \ + ((void *) (uintptr_t) (void __flat *) (void __lds *) GCN_LOWLAT_HEAP) + +static void * +gcn_memspace_alloc (omp_memspace_handle_t memspace, size_t size) +{ + if (memspace == omp_low_lat_mem_space) + { + char *shared_pool = FLAT_HEAP_PTR; + + return __gcn_lowlat_alloc (shared_pool, size); + } + else + return malloc (size); +} + +static void * +gcn_memspace_calloc (omp_memspace_handle_t memspace, size_t size) +{ + if (memspace == omp_low_lat_mem_space) + { + char *shared_pool = FLAT_HEAP_PTR; + + return __gcn_lowlat_calloc (shared_pool, size); + } + else + return calloc (1, size); +} + +static void +gcn_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size) +{ + if (memspace == omp_low_lat_mem_space) + { + char *shared_pool = FLAT_HEAP_PTR; + + __gcn_lowlat_free (shared_pool, addr, size); + } + else + free (addr); +} + +static void * +gcn_memspace_realloc (omp_memspace_handle_t memspace, void *addr, + size_t oldsize, size_t size) +{ + if (memspace == omp_low_lat_mem_space) + { + char *shared_pool = FLAT_HEAP_PTR; + + return __gcn_lowlat_realloc (shared_pool, addr, oldsize, size); + } + else + return realloc (addr, size); +} + +static inline int +gcn_memspace_validate (omp_memspace_handle_t memspace, unsigned access) +{ + /* Disallow use of low-latency memory when it must be accessible by + all threads. 
*/ + return (memspace != omp_low_lat_mem_space + || access != omp_atv_all); +} + +#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \ + gcn_memspace_alloc (MEMSPACE, SIZE) +#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \ + gcn_memspace_calloc (MEMSPACE, SIZE) +#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \ + gcn_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE) +#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \ + gcn_memspace_free (MEMSPACE, ADDR, SIZE) +#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \ + gcn_memspace_validate (MEMSPACE, ACCESS) + +/* The default low-latency memspace implies omp_atv_all, which is incompatible + with the LDS memory space. */ +#define OMP_LOW_LAT_MEM_ALLOC_INVALID 1 + +#include "../../allocator.c" diff --git a/libgomp/config/gcn/libgomp-gcn.h b/libgomp/config/gcn/libgomp-gcn.h index f62b7dde0e7..05b6fb60cc9 100644 --- a/libgomp/config/gcn/libgomp-gcn.h +++ b/libgomp/config/gcn/libgomp-gcn.h @@ -33,6 +33,12 @@ #define DEFAULT_GCN_STACK_SIZE (32*1024) #define DEFAULT_TEAM_ARENA_SIZE (64*1024) +/* These define the LDS location of data needed by OpenMP. */ +#define TEAM_ARENA_START 16 /* LDS offset of arena start pointer. */ +#define TEAM_ARENA_FREE 24 /* LDS offset of free pointer. */ +#define TEAM_ARENA_END 32 /* LDS offset of end pointer. */ +#define GCN_LOWLAT_HEAP 40 /* LDS offset of the OpenMP low-latency heap. */ + struct heap { int64_t size; diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c index fb20cbbcf9f..7ee6115b666 100644 --- a/libgomp/config/gcn/team.c +++ b/libgomp/config/gcn/team.c @@ -29,6 +29,12 @@ #include #include +#define LITTLEENDIAN_CPU +#include "hsa.h" + +/* Defined in basic-allocator.c via config/amdgcn/allocator.c. 
*/ +void __gcn_lowlat_init (void *heap, size_t size); + static void gomp_thread_start (struct gomp_thread_pool *); extern void build_indirect_map (void); @@ -75,6 +81,12 @@ gomp_gcn_enter_kernel (void) *arena_free = team_arena; *arena_end = team_arena + kernargs->arena_size_per_team; + /* Initialize the low-latency heap. The header is the size. */ + void __lds *lowlat = (void __lds *)GCN_LOWLAT_HEAP; + hsa_kernel_dispatch_packet_t *queue_ptr = __builtin_gcn_dispatch_ptr (); + __gcn_lowlat_init ((void*)(uintptr_t)(void __flat*)lowlat, + queue_ptr->group_segment_size - GCN_LOWLAT_HEAP); + /* Allocate and initialize the team-local-storage data. */ struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs) * numthreads); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 15a767cf317..fa29f428976 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -114,9 +114,6 @@ extern void gomp_aligned_free (void *); #ifdef __AMDGCN__ #include "libgomp-gcn.h" /* The arena is initialized in config/gcn/team.c. */ -#define TEAM_ARENA_START 16 /* LDS offset of free pointer. */ -#define TEAM_ARENA_FREE 24 /* LDS offset of free pointer. */ -#define TEAM_ARENA_END 32 /* LDS offset of end pointer. */ static inline void * __attribute__((malloc)) team_malloc (size_t size) diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi index 7fdd6fe9410..9d0aee72b33 100644 --- a/libgomp/libgomp.texi +++ b/libgomp/libgomp.texi @@ -5838,6 +5838,19 @@ The implementation remark: available devices (``host fallback''). @item The available stack size can be changed using the @code{GCN_STACK_SIZE} environment variable; the default is 32 kiB per thread. +@item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when the + @code{access} trait is set to @code{cgroup}. 
The default pool size + is automatically scaled to share the 64 kiB LDS memory between the number + of teams configured to run on each compute-unit, but may be adjusted at + runtime by setting environment variable + @code{GOMP_GCN_LOWLAT_POOL=@var{bytes}}. +@item @code{omp_low_lat_mem_alloc} cannot be used with true low-latency memory + because the definition implies the @code{omp_atv_all} trait; main + graphics memory is used instead. +@item @code{omp_cgroup_mem_alloc}, @code{omp_pteam_mem_alloc}, and + @code{omp_thread_mem_alloc}, all use low-latency memory as first + preference, and fall back to main graphics memory when the low-latency + pool is exhausted. @end itemize diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index 8aabbd99881..7f8178c78b7 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -550,6 +550,7 @@ static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE; static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE; static int stack_size = DEFAULT_GCN_STACK_SIZE; +static int lowlat_size = -1; /* Flag to decide whether print to stderr information about what is going on. Set in init_debug depending on environment variables. */ @@ -1016,8 +1017,8 @@ print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent) fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object); fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "", dispatch->private_segment_size); - fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "", - dispatch->group_segment_size); + fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent, + "", dispatch->group_segment_size); fprintf (stderr, "\n"); } @@ -1088,6 +1089,10 @@ init_environment_variables (void) if (tmp) stack_size = tmp;; } + + const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL"); + if (lowlat) + lowlat_size = atoi (lowlat); } /* Return malloc'd string with name of SYMBOL. 
*/ @@ -1930,7 +1935,25 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams, shadow->signal = sync_signal.handle; shadow->private_segment_size = kernel->private_segment_size; - shadow->group_segment_size = kernel->group_segment_size; + + if (lowlat_size < 0) + { + /* Divide the LDS between the number of running teams. + Allocate not less than is defined in the kernel metadata. */ + int teams_per_cu = num_teams / get_cu_count (agent); + int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536); + shadow->group_segment_size + = (kernel->group_segment_size > LDS_per_team + ? kernel->group_segment_size + : LDS_per_team); + } + else if (lowlat_size < GCN_LOWLAT_HEAP+8) + /* Ensure that there's space for the OpenMP libgomp data. */ + shadow->group_segment_size = GCN_LOWLAT_HEAP+8; + else + shadow->group_segment_size = (lowlat_size > 65536 + ? 65536 + : lowlat_size); /* We expect kernels to request a single pointer, explicitly, and the rest of struct kernargs, implicitly. If they request anything else @@ -2290,9 +2313,9 @@ run_kernel (struct kernel_info *kernel, void *vars, print_kernel_dispatch (shadow, 2); } - packet->private_segment_size = kernel->private_segment_size; - packet->group_segment_size = kernel->group_segment_size; - packet->kernel_object = kernel->object; + packet->private_segment_size = shadow->private_segment_size; + packet->group_segment_size = shadow->group_segment_size; + packet->kernel_object = shadow->object; packet->kernarg_address = shadow->kernarg_address; hsa_signal_t s; s.handle = shadow->signal; diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-traits.c b/libgomp/testsuite/libgomp.c/omp_alloc-traits.c index 4ff0fca4986..e9acc8673a3 100644 --- a/libgomp/testsuite/libgomp.c/omp_alloc-traits.c +++ b/libgomp/testsuite/libgomp.c/omp_alloc-traits.c @@ -1,7 +1,7 @@ /* { dg-do run } */ /* { dg-require-effective-target offload_device } */ -/* { dg-xfail-if "not implemented" { ! 
offload_target_nvptx } } */ +/* { dg-xfail-if "not implemented" { ! { offload_target_nvptx || offload_target_amdgcn } } } */ /* Test that GPU low-latency allocation is limited to team access. */