Optimize GCN OpenMP malloc performance

2019-11-12  Andrew Stubbs

	libgomp/
	* config/gcn/team.c (gomp_gcn_enter_kernel): Set up the team arena
	and use team_malloc variants.
	(gomp_gcn_exit_kernel): Use team_free.
	* libgomp.h (TEAM_ARENA_SIZE): Define.
	(TEAM_ARENA_START): Define.
	(TEAM_ARENA_FREE): Define.
	(TEAM_ARENA_END): Define.
	(team_malloc): New function.
	(team_malloc_cleared): New function.
	(team_free): New function.
	* team.c (gomp_new_team): Initialize and use team_malloc.
	(free_team): Use team_free.
	(gomp_free_thread): Use team_free.
	(gomp_pause_host): Use team_free.
	* work.c (gomp_init_work_share): Use team_malloc.
	(gomp_fini_work_share): Use team_free.

diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index c566482bda2..20d419198e0 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -57,16 +57,28 @@ gomp_gcn_enter_kernel (void)
   /* Starting additional threads is not supported.  */
   gomp_global_icv.dyn_var = true;
 
+  /* Initialize the team arena for optimized memory allocation.
+     The arena has been allocated on the host side, and the address
+     passed in via the kernargs.  Each team takes a small slice of it.  */
+  register void **kernargs asm("s8");
+  void *team_arena = (kernargs[4] + TEAM_ARENA_SIZE*teamid);
+  void * __lds *arena_start = (void * __lds *)TEAM_ARENA_START;
+  void * __lds *arena_free = (void * __lds *)TEAM_ARENA_FREE;
+  void * __lds *arena_end = (void * __lds *)TEAM_ARENA_END;
+  *arena_start = team_arena;
+  *arena_free = team_arena;
+  *arena_end = team_arena + TEAM_ARENA_SIZE;
+
   /* Allocate and initialize the team-local-storage data.  */
-  struct gomp_thread *thrs = gomp_malloc_cleared (sizeof (*thrs)
+  struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs)
 						  * numthreads);
   set_gcn_thrs (thrs);
 
   /* Allocate and initailize a pool of threads in the team.
      The threads are already running, of course, we just need to manage
      the communication between them.  */
-  struct gomp_thread_pool *pool = gomp_malloc (sizeof (*pool));
-  pool->threads = gomp_malloc (sizeof (void *) * numthreads);
+  struct gomp_thread_pool *pool = team_malloc (sizeof (*pool));
+  pool->threads = team_malloc (sizeof (void *) * numthreads);
   for (int tid = 0; tid < numthreads; tid++)
     pool->threads[tid] = &thrs[tid];
   pool->threads_size = numthreads;
@@ -91,7 +103,7 @@ void
 gomp_gcn_exit_kernel (void)
 {
   gomp_free_thread (gcn_thrs ());
-  free (gcn_thrs ());
+  team_free (gcn_thrs ());
 }
 
 /* This function contains the idle loop in which a thread waits
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 19e1241ee4c..bab733d2b2d 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -106,6 +106,69 @@ extern void gomp_aligned_free (void *);
    GCC's builtin alloca().  */
 #define gomp_alloca(x) __builtin_alloca(x)
 
+/* Optimized allocators for team-specific data that will die with the team.  */
+
+#ifdef __AMDGCN__
+/* The arena is initialized in config/gcn/team.c.  */
+#define TEAM_ARENA_SIZE 64*1024  /* Must match the value in plugin-gcn.c.  */
+#define TEAM_ARENA_START 16  /* LDS offset of start pointer.  */
+#define TEAM_ARENA_FREE 24  /* LDS offset of free pointer.  */
+#define TEAM_ARENA_END 32  /* LDS offset of end pointer.  */
+
+static inline void * __attribute__((malloc))
+team_malloc (size_t size)
+{
+  /* 4-byte align the size.  */
+  size = (size + 3) & ~3;
+
+  /* Allocate directly from the arena.
+     The compiler does not support DS atomics, yet.  */
+  void *result;
+  asm ("ds_add_rtn_u64 %0, %1, %2\n\ts_waitcnt 0"
+       : "=v"(result) : "v"(TEAM_ARENA_FREE), "v"(size), "e"(1L) : "memory");
+
+  /* Handle OOM.  */
+  if (result + size > *(void * __lds *)TEAM_ARENA_END)
+    {
+      /* While this is experimental, let's make sure we know when OOM
+	 happens.  */
+      const char msg[] = "GCN team arena exhausted\n";
+      write (2, msg, sizeof(msg)-1);
+
+      /* Fall back to using the heap (slowly).  */
+      result = gomp_malloc (size);
+    }
+  return result;
+}
+
+static inline void * __attribute__((malloc))
+team_malloc_cleared (size_t size)
+{
+  char *result = team_malloc (size);
+
+  /* Clear the allocated memory.  */
+  __builtin_memset (result, 0, size);
+
+  return result;
+}
+
+static inline void
+team_free (void *ptr)
+{
+  /* The whole arena is freed when the kernel exits.
+     However, if we fell back to using the heap then we should free it.
+     It would be better if this function could be a no-op, but at least
+     LDS loads are cheap.  */
+  if (ptr < *(void * __lds *)TEAM_ARENA_START
+      || ptr >= *(void * __lds *)TEAM_ARENA_END)
+    free (ptr);
+}
+#else
+#define team_malloc(...) gomp_malloc (__VA_ARGS__)
+#define team_malloc_cleared(...) gomp_malloc_cleared (__VA_ARGS__)
+#define team_free(...) free (__VA_ARGS__)
+#endif
+
 /* error.c */
 
 extern void gomp_vdebug (int, const char *, va_list);
diff --git a/libgomp/team.c b/libgomp/team.c
index b26caaaaec6..cdfb9ba6c98 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -171,7 +171,7 @@ gomp_new_team (unsigned nthreads)
     {
       size_t extra = sizeof (team->ordered_release[0])
 		     + sizeof (team->implicit_task[0]);
-      team = gomp_malloc (sizeof (*team) + nthreads * extra);
+      team = team_malloc (sizeof (*team) + nthreads * extra);
 
 #ifndef HAVE_SYNC_BUILTINS
       gomp_mutex_init (&team->work_share_list_free_lock);
@@ -221,7 +221,7 @@ free_team (struct gomp_team *team)
   gomp_barrier_destroy (&team->barrier);
   gomp_mutex_destroy (&team->task_lock);
   priority_queue_free (&team->task_queue);
-  free (team);
+  team_free (team);
 }
 
 static void
@@ -285,8 +285,8 @@ gomp_free_thread (void *arg __attribute__((unused)))
       if (pool->last_team)
 	free_team (pool->last_team);
 #ifndef __nvptx__
-      free (pool->threads);
-      free (pool);
+      team_free (pool->threads);
+      team_free (pool);
 #endif
       thr->thread_pool = NULL;
     }
@@ -1082,8 +1082,8 @@ gomp_pause_host (void)
       if (pool->last_team)
 	free_team (pool->last_team);
 #ifndef __nvptx__
-      free (pool->threads);
-      free (pool);
+      team_free (pool->threads);
+      team_free (pool);
 #endif
       thr->thread_pool = NULL;
     }
diff --git a/libgomp/work.c b/libgomp/work.c
index a589b8b5231..28bb0c11255 100644
--- a/libgomp/work.c
+++ b/libgomp/work.c
@@ -120,7 +120,7 @@ gomp_init_work_share (struct gomp_work_share *ws, size_t ordered,
       else
 	ordered = nthreads * sizeof (*ws->ordered_team_ids);
       if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE)
-	ws->ordered_team_ids = gomp_malloc (ordered);
+	ws->ordered_team_ids = team_malloc (ordered);
       else
 	ws->ordered_team_ids = ws->inline_ordered_team_ids;
       memset (ws->ordered_team_ids, '\0', ordered);
@@ -142,7 +142,7 @@ gomp_fini_work_share (struct gomp_work_share *ws)
 {
   gomp_mutex_destroy (&ws->lock);
   if (ws->ordered_team_ids != ws->inline_ordered_team_ids)
-    free (ws->ordered_team_ids);
+    team_free (ws->ordered_team_ids);
   gomp_ptrlock_destroy (&ws->next_ws);
 }
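
Illustrative note (not part of the patch): the sketch below mirrors, in plain host-side C, the bump-allocation scheme that team_malloc implements, so the fallback logic is easier to follow.  It uses an ordinary struct and __atomic_fetch_add in place of the LDS-resident pointers and the ds_add_rtn_u64 instruction used on GCN; the arena_demo names are hypothetical and exist only for this example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for one team's slice of the arena; on GCN the
   start/free/end pointers live at fixed LDS offsets instead.  */
struct arena_demo
{
  char *start;	/* First byte of this team's slice.  */
  char *free;	/* Next unallocated byte (bumped atomically).  */
  char *end;	/* One past the last byte of the slice.  */
};

static void *
arena_demo_malloc (struct arena_demo *a, size_t size)
{
  /* 4-byte align the size, as team_malloc does.  */
  size = (size + 3) & ~(size_t) 3;

  /* Atomically bump the free pointer; the old value is the new block.
     team_malloc does the same with ds_add_rtn_u64 on the LDS word.  */
  char *result = __atomic_fetch_add (&a->free, size, __ATOMIC_RELAXED);

  /* Out of arena space: fall back to the (slow) heap.  */
  if (result + size > a->end)
    {
      fprintf (stderr, "demo arena exhausted\n");
      return malloc (size);
    }
  return result;
}

static void
arena_demo_free (struct arena_demo *a, void *ptr)
{
  /* Arena blocks are reclaimed en masse when the team exits; only heap
     fallbacks need an explicit free, as in team_free.  */
  if ((char *) ptr < a->start || (char *) ptr >= a->end)
    free (ptr);
}

int
main (void)
{
  static char slice[64 * 1024] __attribute__ ((aligned (8)));
  struct arena_demo a = { slice, slice, slice + sizeof slice };

  int *p = arena_demo_malloc (&a, 100 * sizeof (int));
  memset (p, 0, 100 * sizeof (int));
  arena_demo_free (&a, p);	/* No-op: p lies inside the arena.  */
  return 0;
}

The key property is that a per-team allocation costs a single atomic add, and the whole slice is reclaimed wholesale when the team finishes; only allocations that overflowed into the heap need an individual free.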