[Patch] libgomp: Use libnuma for OpenMP's partition=nearest allocation trait

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [Patch] libgomp: Use libnuma for OpenMP's partition=nearest allocation trait
@ 2023-07-11 10:35 Tobias Burnus
  2023-07-12 12:05 ` Tobias Burnus
  0 siblings, 1 reply; 4+ messages in thread
From: Tobias Burnus @ 2023-07-11 10:35 UTC (permalink / raw)
  To: gcc-patches, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 1913 bytes --]

While by default 'malloc' allocates memory on the same node as the calling
process/thread ('numactl --show' shows 'preferred node: current',
Linux kernel memory policy MPOL_DEFAULT), this can be changed.
For instance, when running the program as follows, 'malloc' now
prefers to allocate on the second node:
   numactl --preferred=1 ./myproc

Thus, it seems to be sensible to provide a means to ensure the 'nearest'
allocation.  The MPOL_LOCAL policy does so, as provided by
libnuma's numa_alloc_local. (Which is just a wrapper around the syscalls
mmap and mbind.) As with (lib)memkind, there is a run-time dlopen check
for (lib)numa - and no numa*.h is required when building GCC.

The patch assumes that yesterday's patch
   'libgomp: Update OpenMP memory allocation doc, fix omp_high_bw_mem_space'
   https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624030.html
has already been applied. (Which is mostly a .texi only patch, except
for one 'return' -> 'break' change.)

This patch has been bootstrapped and manually tested on x86-64.
It also passed "make check".

Comments, remarks, thoughts?

[I really dislike committing patches without any feedback from others,
but I still intend to do so, if no one comments. This applies to this patch
and the other one.]

Tobias

PS: I have attached a testcase, but as it needs -lnuma, I do not intend
to commit it.  An alternative would be to do the same as we do in
the patch itself; namely, to use the dlopen handle to obtain the two
libnuma library calls. - I am unsure whether I should do so or
whether I should just leave out the testcase.

Thoughts?
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

[-- Attachment #2: libnuma.diff --]
[-- Type: text/x-patch, Size: 16470 bytes --]

libgomp: Use libnuma for OpenMP's partition=nearest allocation trait

libgomp/ChangeLog:

	* allocator.c: Add ifdef for LIBGOMP_USE_LIBNUMA.
	(enum gomp_numa_memkind_kind): Renamed from gomp_memkind_kind;
	add GOMP_MEMKIND_LIBNUMA.
	(struct gomp_libnuma_data, gomp_init_libnuma, gomp_get_libnuma): New.
	(omp_init_allocator): Handle partition=nearest with libnuma if avail.
	(omp_aligned_alloc, omp_free, omp_aligned_calloc, omp_realloc): Add
	numa_alloc_local (+ memset), numa_free, and numa_realloc calls as
	needed.
	* config/linux/allocator.c (LIBGOMP_USE_LIBNUMA): Define
	* libgomp.texi (Memory allocation): Renamed from 'Memory allocation
	with libmemkind'; updated for libnuma usage.

 libgomp/allocator.c              | 202 +++++++++++++++++++++++++++++++++------
 libgomp/config/linux/allocator.c |   1 +
 libgomp/libgomp.texi             |  22 ++++-
 3 files changed, 195 insertions(+), 30 deletions(-)

diff --git a/libgomp/allocator.c b/libgomp/allocator.c
index 25c0f150302..2632f16e132 100644
--- a/libgomp/allocator.c
+++ b/libgomp/allocator.c
@@ -31,13 +31,13 @@
 #include "libgomp.h"
 #include <stdlib.h>
 #include <string.h>
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 #include <dlfcn.h>
 #endif
 
 #define omp_max_predefined_alloc omp_thread_mem_alloc
 
-enum gomp_memkind_kind
+enum gomp_numa_memkind_kind
 {
   GOMP_MEMKIND_NONE = 0,
 #define GOMP_MEMKIND_KINDS \
@@ -50,7 +50,8 @@ enum gomp_memkind_kind
 #define GOMP_MEMKIND_KIND(kind) GOMP_MEMKIND_##kind
   GOMP_MEMKIND_KINDS,
 #undef GOMP_MEMKIND_KIND
-  GOMP_MEMKIND_COUNT
+  GOMP_MEMKIND_COUNT,
+  GOMP_MEMKIND_LIBNUMA = GOMP_MEMKIND_COUNT
 };
 
 struct omp_allocator_data
@@ -65,7 +66,7 @@ struct omp_allocator_data
   unsigned int fallback : 8;
   unsigned int pinned : 1;
   unsigned int partition : 7;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
   unsigned int memkind : 8;
 #endif
 #ifndef HAVE_SYNC_BUILTINS
@@ -81,6 +82,14 @@ struct omp_mem_header
   void *pad;
 };
 
+struct gomp_libnuma_data
+{
+  void *numa_handle;
+  void *(*numa_alloc_local) (size_t);
+  void *(*numa_realloc) (void *, size_t, size_t);
+  void (*numa_free) (void *, size_t);
+};
+
 struct gomp_memkind_data
 {
   void *memkind_handle;
@@ -92,6 +101,50 @@ struct gomp_memkind_data
   void **kinds[GOMP_MEMKIND_COUNT];
 };
 
+#ifdef LIBGOMP_USE_LIBNUMA
+static struct gomp_libnuma_data *libnuma_data;
+static pthread_once_t libnuma_data_once = PTHREAD_ONCE_INIT;
+
+static void
+gomp_init_libnuma (void)
+{
+  void *handle = dlopen ("libnuma.so.1", RTLD_LAZY);
+  struct gomp_libnuma_data *data;
+
+  data = calloc (1, sizeof (struct gomp_libnuma_data));
+  if (data == NULL)
+    {
+      if (handle)
+	dlclose (handle);
+      return;
+    }
+  if (!handle)
+    {
+      __atomic_store_n (&libnuma_data, data, MEMMODEL_RELEASE);
+      return;
+    }
+  data->numa_handle = handle;
+  data->numa_alloc_local
+    = (__typeof (data->numa_alloc_local)) dlsym (handle, "numa_alloc_local");
+  data->numa_realloc
+    = (__typeof (data->numa_realloc)) dlsym (handle, "numa_realloc");
+  data->numa_free
+    = (__typeof (data->numa_free)) dlsym (handle, "numa_free");
+  __atomic_store_n (&libnuma_data, data, MEMMODEL_RELEASE);
+}
+
+static struct gomp_libnuma_data *
+gomp_get_libnuma (void)
+{
+  struct gomp_libnuma_data *data
+    = __atomic_load_n (&libnuma_data, MEMMODEL_ACQUIRE);
+  if (data)
+    return data;
+  pthread_once (&libnuma_data_once, gomp_init_libnuma);
+  return __atomic_load_n (&libnuma_data, MEMMODEL_ACQUIRE);
+}
+#endif
+
 #ifdef LIBGOMP_USE_MEMKIND
 static struct gomp_memkind_data *memkind_data;
 static pthread_once_t memkind_data_once = PTHREAD_ONCE_INIT;
@@ -166,7 +219,7 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
   struct omp_allocator_data data
     = { memspace, 1, ~(uintptr_t) 0, 0, 0, omp_atv_contended, omp_atv_all,
 	omp_atv_default_mem_fb, omp_atv_false, omp_atv_environment,
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 	GOMP_MEMKIND_NONE
 #endif
       };
@@ -285,8 +338,8 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
 
   switch (memspace)
     {
-    case omp_high_bw_mem_space:
 #ifdef LIBGOMP_USE_MEMKIND
+    case omp_high_bw_mem_space:
       struct gomp_memkind_data *memkind_data;
       memkind_data = gomp_get_memkind ();
       if (data.partition == omp_atv_interleaved
@@ -300,17 +353,15 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
 	  data.memkind = GOMP_MEMKIND_HBW_PREFERRED;
 	  break;
 	}
-#endif
       break;
     case omp_large_cap_mem_space:
-#ifdef LIBGOMP_USE_MEMKIND
       memkind_data = gomp_get_memkind ();
       if (memkind_data->kinds[GOMP_MEMKIND_DAX_KMEM_ALL])
 	data.memkind = GOMP_MEMKIND_DAX_KMEM_ALL;
       else if (memkind_data->kinds[GOMP_MEMKIND_DAX_KMEM])
 	data.memkind = GOMP_MEMKIND_DAX_KMEM;
-#endif
       break;
+#endif
     default:
 #ifdef LIBGOMP_USE_MEMKIND
       if (data.partition == omp_atv_interleaved)
@@ -323,6 +374,14 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
       break;
     }
 
+#ifdef LIBGOMP_USE_LIBNUMA
+  if (data.memkind == GOMP_MEMKIND_NONE && data.partition == omp_atv_nearest)
+    {
+      data.memkind = GOMP_MEMKIND_LIBNUMA;
+      libnuma_data = gomp_get_libnuma ();
+    }
+#endif
+
   /* No support for this so far.  */
   if (data.pinned)
     return omp_null_allocator;
@@ -357,8 +416,8 @@ omp_aligned_alloc (size_t alignment, size_t size,
   struct omp_allocator_data *allocator_data;
   size_t new_size, new_alignment;
   void *ptr, *ret;
-#ifdef LIBGOMP_USE_MEMKIND
-  enum gomp_memkind_kind memkind;
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
+  enum gomp_numa_memkind_kind memkind;
 #endif
 
   if (__builtin_expect (size == 0, 0))
@@ -379,7 +438,7 @@ retry:
       allocator_data = (struct omp_allocator_data *) allocator;
       if (new_alignment < allocator_data->alignment)
 	new_alignment = allocator_data->alignment;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = allocator_data->memkind;
 #endif
     }
@@ -388,8 +447,10 @@ retry:
       allocator_data = NULL;
       if (new_alignment < sizeof (void *))
 	new_alignment = sizeof (void *);
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
       if (allocator == omp_high_bw_mem_alloc)
 	memkind = GOMP_MEMKIND_HBW_PREFERRED;
       else if (allocator == omp_large_cap_mem_alloc)
@@ -444,6 +505,13 @@ retry:
       allocator_data->used_pool_size = used_pool_size;
       gomp_mutex_unlock (&allocator_data->lock);
 #endif
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -469,6 +537,13 @@ retry:
     }
   else
     {
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -502,7 +577,7 @@ fail:
 	{
 	case omp_atv_default_mem_fb:
 	  if ((new_alignment > sizeof (void *) && new_alignment > alignment)
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 	      || memkind
 #endif
 	      || (allocator_data
@@ -577,6 +652,16 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
 	  gomp_mutex_unlock (&allocator_data->lock);
 #endif
 	}
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (allocator_data->memkind == GOMP_MEMKIND_LIBNUMA)
+	{
+	  libnuma_data->numa_free (data->ptr, data->size);
+	  return;
+	}
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (allocator_data->memkind)
 	{
@@ -590,7 +675,7 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
 #ifdef LIBGOMP_USE_MEMKIND
   else
     {
-      enum gomp_memkind_kind memkind = GOMP_MEMKIND_NONE;
+      enum gomp_numa_memkind_kind memkind = GOMP_MEMKIND_NONE;
       if (data->allocator == omp_high_bw_mem_alloc)
 	memkind = GOMP_MEMKIND_HBW_PREFERRED;
       else if (data->allocator == omp_large_cap_mem_alloc)
@@ -625,8 +710,8 @@ omp_aligned_calloc (size_t alignment, size_t nmemb, size_t size,
   struct omp_allocator_data *allocator_data;
   size_t new_size, size_temp, new_alignment;
   void *ptr, *ret;
-#ifdef LIBGOMP_USE_MEMKIND
-  enum gomp_memkind_kind memkind;
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
+  enum gomp_numa_memkind_kind memkind;
 #endif
 
   if (__builtin_expect (size == 0 || nmemb == 0, 0))
@@ -647,7 +732,7 @@ retry:
       allocator_data = (struct omp_allocator_data *) allocator;
       if (new_alignment < allocator_data->alignment)
 	new_alignment = allocator_data->alignment;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = allocator_data->memkind;
 #endif
     }
@@ -656,8 +741,10 @@ retry:
       allocator_data = NULL;
       if (new_alignment < sizeof (void *))
 	new_alignment = sizeof (void *);
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
       if (allocator == omp_high_bw_mem_alloc)
 	memkind = GOMP_MEMKIND_HBW_PREFERRED;
       else if (allocator == omp_large_cap_mem_alloc)
@@ -714,6 +801,16 @@ retry:
       allocator_data->used_pool_size = used_pool_size;
       gomp_mutex_unlock (&allocator_data->lock);
 #endif
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	{
+	  ptr = libnuma_data->numa_alloc_local (new_size);
+	  memset (ptr, '\0', new_size);
+	}
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -739,6 +836,16 @@ retry:
     }
   else
     {
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	{
+	  ptr = libnuma_data->numa_alloc_local (new_size);
+	  memset (ptr, '\0', new_size);
+	}
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -772,7 +879,7 @@ fail:
 	{
 	case omp_atv_default_mem_fb:
 	  if ((new_alignment > sizeof (void *) && new_alignment > alignment)
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 	      || memkind
 #endif
 	      || (allocator_data
@@ -815,8 +922,8 @@ omp_realloc (void *ptr, size_t size, omp_allocator_handle_t allocator,
   size_t new_size, old_size, new_alignment, old_alignment;
   void *new_ptr, *ret;
   struct omp_mem_header *data;
-#ifdef LIBGOMP_USE_MEMKIND
-  enum gomp_memkind_kind memkind, free_memkind;
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
+  enum gomp_numa_memkind_kind memkind, free_memkind;
 #endif
 
   if (__builtin_expect (ptr == NULL, 0))
@@ -841,15 +948,17 @@ retry:
       allocator_data = (struct omp_allocator_data *) allocator;
       if (new_alignment < allocator_data->alignment)
 	new_alignment = allocator_data->alignment;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = allocator_data->memkind;
 #endif
     }
   else
     {
       allocator_data = NULL;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
       if (allocator == omp_high_bw_mem_alloc)
 	memkind = GOMP_MEMKIND_HBW_PREFERRED;
       else if (allocator == omp_large_cap_mem_alloc)
@@ -865,15 +974,17 @@ retry:
   if (free_allocator > omp_max_predefined_alloc)
     {
       free_allocator_data = (struct omp_allocator_data *) free_allocator;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       free_memkind = free_allocator_data->memkind;
 #endif
     }
   else
     {
       free_allocator_data = NULL;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       free_memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
       if (free_allocator == omp_high_bw_mem_alloc)
 	free_memkind = GOMP_MEMKIND_HBW_PREFERRED;
       else if (free_allocator == omp_large_cap_mem_alloc)
@@ -953,6 +1064,19 @@ retry:
       allocator_data->used_pool_size = used_pool_size;
       gomp_mutex_unlock (&allocator_data->lock);
 #endif
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	{
+	  if (prev_size)
+	    new_ptr = libnuma_data->numa_realloc (data->ptr, data->size,
+						  new_size);
+	  else
+	    new_ptr = libnuma_data->numa_alloc_local (new_size);
+	}
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -1000,6 +1124,13 @@ retry:
 	   && (free_allocator_data == NULL
 	       || free_allocator_data->pool_size == ~(uintptr_t) 0))
     {
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	new_ptr = libnuma_data->numa_realloc (data->ptr, data->size, new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -1021,6 +1152,13 @@ retry:
     }
   else
     {
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	new_ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -1060,6 +1198,16 @@ retry:
       gomp_mutex_unlock (&free_allocator_data->lock);
 #endif
     }
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	{
+	  libnuma_data->numa_free (data->ptr, data->size);
+	  return ret;
+	}
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
   if (free_memkind)
     {
@@ -1079,7 +1227,7 @@ fail:
 	{
 	case omp_atv_default_mem_fb:
 	  if (new_alignment > sizeof (void *)
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 	      || memkind
 #endif
 	      || (allocator_data
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
index 15babcd1ada..64b1b4b9623 100644
--- a/libgomp/config/linux/allocator.c
+++ b/libgomp/config/linux/allocator.c
@@ -31,6 +31,7 @@
 #include "libgomp.h"
 #if defined(PLUGIN_SUPPORT) && defined(LIBGOMP_USE_PTHREADS)
 #define LIBGOMP_USE_MEMKIND
+#define LIBGOMP_USE_LIBNUMA
 #endif
 
 #include "../../allocator.c"
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index b1f58e74903..40328456a1d 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -4584,7 +4584,7 @@ offloading devices (it's not clear if they should be):
 @menu
 * Implementation-defined ICV Initialization::
 * OpenMP Context Selectors::
-* Memory allocation with libmemkind::
+* Memory allocation::
 @end menu
 
 @node Implementation-defined ICV Initialization
@@ -4631,8 +4631,8 @@ smaller number.  On non-host devices, the value of the
       @tab See @code{-march=} in ``Nvidia PTX Options''
 @end multitable
 
-@node Memory allocation with libmemkind
-@section Memory allocation with libmemkind
+@node Memory allocation
+@section Memory allocation
 
 For the memory spaces, the following applies:
 @itemize
@@ -4656,6 +4656,22 @@ creating memory allocators requesting
       @code{omp_large_cap_mem_space} the allocation will not be interleaved
 @end itemize
 
+On Linux systems, where the @uref{https://github.com/numactl/numactl, numa
+library} (@code{libnuma.so.1}) is available at runtime, it is used when creating
+memory allocators requesting
+
+@itemize
+@item the partition trait @code{omp_atv_nearest}, except when the libmemkind
+library is available and the memory space is either
+@code{omp_large_cap_mem_space} or @code{omp_high_bw_mem_space}
+@end itemize
+
+Note that the numa library will round up the allocation size to a multiple of
+the system page size; therefore, consider using it only with large data or
+by sharing allocations by using the @code{pool_size} trait.  Additionally,
+the numa library does not guarantee that for reallocations the same node will
+be used.
+
 Additional notes:
 @itemize
 @item The @code{pinned} trait is unsupported.

[-- Attachment #3: allocator-7.c --]
[-- Type: text/x-csrc, Size: 3580 bytes --]

/* Link with "-lnuma"  */

/* Check that the allocation is done on the same node
   as the CPU executing the thread ('partition' = 'nearest').  */

#include <stdio.h>

#if !__has_include(<sched.h>) || !__has_include(<numaif.h>) || !__has_include(<dlfcn.h>)
/* Fallback entry point, compiled only when the (g)libc or libnuma headers
   probed by the surrounding #if are missing: report on stderr that the real
   test was skipped and exit successfully.  */
int
main ()
{
  fputs ("SKIPPED actual execution: (g)libc and/or libnuma header files not found\n", stderr);
  return 0;
}

#else

/* Assume that when numaif.h is available, the system also has a GLIBC/Kernel
   supporting 'sched_getcpu'.  */

#define __USE_GNU
#include <sched.h>   /* sched_getcpu - added in ec08f13dad for GLIBC_2.6. */
#include <dlfcn.h>   /* dlopen/dlclose  */
#include <numa.h>    /* numa_node_of_cpu  */
#include <numaif.h>  /* get_mempolicy  */
#include <assert.h>
#include <omp.h>

/* Check that allocators created with the 'partition = nearest' trait place
   memory on the NUMA node of the CPU that runs the allocating thread.

   The test is skipped (returning 0) when libnuma.so.1 cannot be dlopen'ed.
   The large-capacity and high-bandwidth memory spaces are only exercised
   when libmemkind.so.0 is not available.  Returns 0 on success; any failed
   check aborts through assert.  */
int
main (void)
{
  const omp_alloctrait_t traits[]
    = { { omp_atk_fallback, omp_atv_abort_fb },
	{ omp_atk_partition, omp_atv_nearest } };
  int has_libnuma, has_memkind;

  /* The libraries are only probed for presence.  dlclose must only be
     given a handle that dlopen actually returned, hence the NULL guards.  */
  void *handle = dlopen ("libnuma.so.1", RTLD_LAZY);
  has_libnuma = handle != NULL;
  if (handle != NULL)
    dlclose (handle);

  handle = dlopen ("libmemkind.so.0", RTLD_LAZY);
  has_memkind = handle != NULL;
  if (handle != NULL)
    dlclose (handle);

  if (!has_libnuma)
    {
      fprintf (stderr, "SKIPPED actual execution: dlopen of libnuma.so.1 failed (not installed?)\n");
      return 0;
    }
  fprintf (stderr, "Note: libmemkind.so.0 does %sexist\n", has_memkind ? "" : "NOT ");

  omp_allocator_handle_t a1, a2;
  /* a3/a4 are only created and used when libmemkind is absent; give them a
     defined value for the other case.  */
  omp_allocator_handle_t a3 = omp_null_allocator;
  omp_allocator_handle_t a4 = omp_null_allocator;

  a1 = omp_init_allocator (omp_default_mem_space, 2, traits);
  assert (a1 != omp_null_allocator);

  a2 = omp_init_allocator (omp_low_lat_mem_space, 2, traits);
  assert (a2 != omp_null_allocator);

  if (!has_memkind)
    {
      a3 = omp_init_allocator (omp_large_cap_mem_space, 2, traits);
      assert (a3 != omp_null_allocator);

      a4 = omp_init_allocator (omp_high_bw_mem_space, 2, traits);
      assert (a4 != omp_null_allocator);
    }

  #pragma omp parallel  /* Increase the chance that multiple nodes are used. */
  {
    /* numa_node_of_cpu returns an int; keep it signed so that an error
       return is detected and the comparisons below are not mixed-sign.  */
    int node_cpu;
    int node_memory;
    int *i1, *i2, *i3 = NULL, *i4 = NULL;

    node_cpu = numa_node_of_cpu (sched_getcpu ());
    assert (node_cpu >= 0);

    /* Each allocation is written to, so that the memory is actually
       touched before its node is queried.  */
    i1 = (int*) omp_alloc (sizeof(int)*1025*1, a1);
    assert (i1 != NULL);
    i1[1024] = 1;
    i2 = (int*) omp_alloc (sizeof(int)*1025*2, a2);
    assert (i2 != NULL);
    i2[1024] = 2;

    if (!has_memkind)
      {
	i3 = (int*) omp_alloc (sizeof(int)*1025*3, a3);
	assert (i3 != NULL);
	i3[1024] = 3;
	i4 = (int*) omp_alloc (sizeof(int)*1025*4, a4);
	assert (i4 != NULL);
	i4[1024] = 4;
      }

    /* With MPOL_F_ADDR|MPOL_F_NODE, get_mempolicy stores the node of the
       given address in node_memory.  */
    node_memory = -1;
    if (get_mempolicy (&node_memory, NULL, 0, i1, MPOL_F_ADDR|MPOL_F_NODE) < 0)
      assert (0 && "get_mempolicy for i1");
    assert (node_memory == node_cpu);

    node_memory = -1;
    if (get_mempolicy (&node_memory, NULL, 0, i2, MPOL_F_ADDR|MPOL_F_NODE) < 0)
      assert (0 && "get_mempolicy for i2");
    assert (node_memory == node_cpu);

    if (!has_memkind)
      {
	node_memory = -1;
	if (get_mempolicy (&node_memory, NULL, 0, i3, MPOL_F_ADDR|MPOL_F_NODE) < 0)
	  assert (0 && "get_mempolicy for i3");
	assert (node_memory == node_cpu);

	node_memory = -1;
	if (get_mempolicy (&node_memory, NULL, 0, i4, MPOL_F_ADDR|MPOL_F_NODE) < 0)
	  assert (0 && "get_mempolicy for i4");
	assert (node_memory == node_cpu);
      }

    omp_free (i1, a1);
    omp_free (i2, a2);
    if (!has_memkind)
      {
	omp_free (i3, a3);
	omp_free (i4, a4);
      }
  }

  omp_destroy_allocator (a1);
  omp_destroy_allocator (a2);
  if (!has_memkind)
    {
      omp_destroy_allocator (a3);
      omp_destroy_allocator (a4);
    }
  return 0;
}

#endif

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [Patch] libgomp: Use libnuma for OpenMP's partition=nearest allocation trait
  2023-07-11 10:35 [Patch] libgomp: Use libnuma for OpenMP's partition=nearest allocation trait Tobias Burnus
@ 2023-07-12 12:05 ` Tobias Burnus
  2023-07-13 16:13   ` Prathamesh Kulkarni
  0 siblings, 1 reply; 4+ messages in thread
From: Tobias Burnus @ 2023-07-12 12:05 UTC (permalink / raw)
  To: gcc-patches, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 2537 bytes --]

Now committed as r14-2462-g450b05ce54d3f0.

Changes to the patch in previous email:
* I fixed some issues found on the way,
* The wording in the .texi has been improved/expanded, and
* I included two testcases to exercise the two libraries (or
   the default allocator when it is not available at runtime).

Given that the default allocation already works fine (nearest)
and the normal "malloc" is more economic in terms of memory
handling (not multiples of page size or requesting a fixed
pool size), I was wondering whether this patch is really needed.

But in the end: the default can be changed (cf. below) and giving
the user the choice makes sense. The manual states what GCC does,
which should help to make a conscious choice.

* * *

I did experiment with the testcase attached to previous email
plus using dlopen to obtain the functions from libnuma if available.

It was also using:
/* { dg-do run { target { dlopen } } } */
/* { dg-additional-options "-ldl" } */

However, the Linux kernel too often placed the allocated memory
on the "wrong" node to be usable as a testcase. I did get
0 to 15 misplaced allocations, depending on the run.

Hence, there is no such testcase. Using numactl --preferred=1 I
could force the normal allocation to (mostly) use node 1 for
allocations such that the difference between partition = default/environment
vs. partition = nearest was clearly visible. Hence it does work.

Otherwise, the same applies as I wrote yesterday:

On 11.07.23 12:35, Tobias Burnus wrote:

> While by default 'malloc' allocates memory on the same node as the
> calling
> process/thread ('numactl --show' shows 'preferred node: current',
> Linux kernel memory policy MPOL_DEFAULT), this can be changed.
> For instance, when running the program as follows, 'malloc' now
> prefers to allocate on the second node:
>   numactl --preferred=1 ./myproc
>
> Thus, it seems to be sensible to provide a means to ensure the 'nearest'
> allocation.  The MPOL_LOCAL policy does so, as provided by
> libnuma's numa_alloc_local. (Which is just wrapper around the syscalls
> mmap and mbind.) As with (lib)memkind, there is a run-time dlopen check
> for (lib)numa - and no numa*.h is required when bulding GCC.
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

[-- Attachment #2: committed.diff --]
[-- Type: text/x-patch, Size: 37392 bytes --]

commit 450b05ce54d3f08c583c3b5341233ce0df99725b
Author: Tobias Burnus <tobias@codesourcery.com>
Date:   Wed Jul 12 13:50:21 2023 +0200

    libgomp: Use libnuma for OpenMP's partition=nearest allocation trait
    
    As with the memkind library, it is only used when found at runtime;
    it does not need to be present when building GCC.
    
    The included testcase does not check whether the memory has been placed
    on the nearest node as the Linux kernel memory handling too often ignores
    that hint, using a different node for the allocation.  However, when
    running with 'numactl --preferred=<node> ./executable', it is clearly
    visible that the feature works by comparing malloc/default vs. nearest
    placement (using get_mempolicy to obtain the node for a mem addr).
    
    libgomp/ChangeLog:
    
            * allocator.c: Add ifdef for LIBGOMP_USE_LIBNUMA.
            (enum gomp_numa_memkind_kind): Renamed from gomp_memkind_kind;
            add GOMP_MEMKIND_LIBNUMA.
            (struct gomp_libnuma_data, gomp_init_libnuma, gomp_get_libnuma): New.
            (omp_init_allocator): Handle partition=nearest with libnuma if avail.
            (omp_aligned_alloc, omp_free, omp_aligned_calloc, omp_realloc): Add
            numa_alloc_local (+ memset), numa_free, and numa_realloc calls as
            needed.
            * config/linux/allocator.c (LIBGOMP_USE_LIBNUMA): Define
            * libgomp.texi: Fix a typo; use 'fi' instead of its ligature char.
            (Memory allocation): Renamed from 'Memory allocation with libmemkind';
            updated for libnuma usage.
            * testsuite/libgomp.c-c++-common/alloc-11.c: New test.
            * testsuite/libgomp.c-c++-common/alloc-12.c: New test.
---
 libgomp/allocator.c                               | 202 ++++++++++++---
 libgomp/config/linux/allocator.c                  |   1 +
 libgomp/libgomp.texi                              |  42 +++-
 libgomp/testsuite/libgomp.c-c++-common/alloc-11.c | 285 ++++++++++++++++++++++
 libgomp/testsuite/libgomp.c-c++-common/alloc-12.c | 217 ++++++++++++++++
 5 files changed, 708 insertions(+), 39 deletions(-)

diff --git a/libgomp/allocator.c b/libgomp/allocator.c
index 25c0f150302..b3187ab2911 100644
--- a/libgomp/allocator.c
+++ b/libgomp/allocator.c
@@ -31,13 +31,13 @@
 #include "libgomp.h"
 #include <stdlib.h>
 #include <string.h>
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 #include <dlfcn.h>
 #endif
 
 #define omp_max_predefined_alloc omp_thread_mem_alloc
 
-enum gomp_memkind_kind
+enum gomp_numa_memkind_kind
 {
   GOMP_MEMKIND_NONE = 0,
 #define GOMP_MEMKIND_KINDS \
@@ -50,7 +50,8 @@ enum gomp_memkind_kind
 #define GOMP_MEMKIND_KIND(kind) GOMP_MEMKIND_##kind
   GOMP_MEMKIND_KINDS,
 #undef GOMP_MEMKIND_KIND
-  GOMP_MEMKIND_COUNT
+  GOMP_MEMKIND_COUNT,
+  GOMP_MEMKIND_LIBNUMA = GOMP_MEMKIND_COUNT
 };
 
 struct omp_allocator_data
@@ -65,7 +66,7 @@ struct omp_allocator_data
   unsigned int fallback : 8;
   unsigned int pinned : 1;
   unsigned int partition : 7;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
   unsigned int memkind : 8;
 #endif
 #ifndef HAVE_SYNC_BUILTINS
@@ -81,6 +82,14 @@ struct omp_mem_header
   void *pad;
 };
 
+struct gomp_libnuma_data
+{
+  void *numa_handle;
+  void *(*numa_alloc_local) (size_t);
+  void *(*numa_realloc) (void *, size_t, size_t);
+  void (*numa_free) (void *, size_t);
+};
+
 struct gomp_memkind_data
 {
   void *memkind_handle;
@@ -92,6 +101,50 @@ struct gomp_memkind_data
   void **kinds[GOMP_MEMKIND_COUNT];
 };
 
+#ifdef LIBGOMP_USE_LIBNUMA
+static struct gomp_libnuma_data *libnuma_data;
+static pthread_once_t libnuma_data_once = PTHREAD_ONCE_INIT;
+
+static void
+gomp_init_libnuma (void)
+{
+  void *handle = dlopen ("libnuma.so.1", RTLD_LAZY);
+  struct gomp_libnuma_data *data;
+
+  data = calloc (1, sizeof (struct gomp_libnuma_data));
+  if (data == NULL)
+    {
+      if (handle)
+	dlclose (handle);
+      return;
+    }
+  if (!handle)
+    {
+      __atomic_store_n (&libnuma_data, data, MEMMODEL_RELEASE);
+      return;
+    }
+  data->numa_handle = handle;
+  data->numa_alloc_local
+    = (__typeof (data->numa_alloc_local)) dlsym (handle, "numa_alloc_local");
+  data->numa_realloc
+    = (__typeof (data->numa_realloc)) dlsym (handle, "numa_realloc");
+  data->numa_free
+    = (__typeof (data->numa_free)) dlsym (handle, "numa_free");
+  __atomic_store_n (&libnuma_data, data, MEMMODEL_RELEASE);
+}
+
+static struct gomp_libnuma_data *
+gomp_get_libnuma (void)
+{
+  struct gomp_libnuma_data *data
+    = __atomic_load_n (&libnuma_data, MEMMODEL_ACQUIRE);
+  if (data)
+    return data;
+  pthread_once (&libnuma_data_once, gomp_init_libnuma);
+  return __atomic_load_n (&libnuma_data, MEMMODEL_ACQUIRE);
+}
+#endif
+
 #ifdef LIBGOMP_USE_MEMKIND
 static struct gomp_memkind_data *memkind_data;
 static pthread_once_t memkind_data_once = PTHREAD_ONCE_INIT;
@@ -166,7 +219,7 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
   struct omp_allocator_data data
     = { memspace, 1, ~(uintptr_t) 0, 0, 0, omp_atv_contended, omp_atv_all,
 	omp_atv_default_mem_fb, omp_atv_false, omp_atv_environment,
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 	GOMP_MEMKIND_NONE
 #endif
       };
@@ -285,8 +338,8 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
 
   switch (memspace)
     {
-    case omp_high_bw_mem_space:
 #ifdef LIBGOMP_USE_MEMKIND
+    case omp_high_bw_mem_space:
       struct gomp_memkind_data *memkind_data;
       memkind_data = gomp_get_memkind ();
       if (data.partition == omp_atv_interleaved
@@ -300,17 +353,15 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
 	  data.memkind = GOMP_MEMKIND_HBW_PREFERRED;
 	  break;
 	}
-#endif
       break;
     case omp_large_cap_mem_space:
-#ifdef LIBGOMP_USE_MEMKIND
       memkind_data = gomp_get_memkind ();
       if (memkind_data->kinds[GOMP_MEMKIND_DAX_KMEM_ALL])
 	data.memkind = GOMP_MEMKIND_DAX_KMEM_ALL;
       else if (memkind_data->kinds[GOMP_MEMKIND_DAX_KMEM])
 	data.memkind = GOMP_MEMKIND_DAX_KMEM;
-#endif
       break;
+#endif
     default:
 #ifdef LIBGOMP_USE_MEMKIND
       if (data.partition == omp_atv_interleaved)
@@ -323,6 +374,14 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
       break;
     }
 
+#ifdef LIBGOMP_USE_LIBNUMA
+  if (data.memkind == GOMP_MEMKIND_NONE && data.partition == omp_atv_nearest)
+    {
+      data.memkind = GOMP_MEMKIND_LIBNUMA;
+      libnuma_data = gomp_get_libnuma ();
+    }
+#endif
+
   /* No support for this so far.  */
   if (data.pinned)
     return omp_null_allocator;
@@ -357,8 +416,8 @@ omp_aligned_alloc (size_t alignment, size_t size,
   struct omp_allocator_data *allocator_data;
   size_t new_size, new_alignment;
   void *ptr, *ret;
-#ifdef LIBGOMP_USE_MEMKIND
-  enum gomp_memkind_kind memkind;
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
+  enum gomp_numa_memkind_kind memkind;
 #endif
 
   if (__builtin_expect (size == 0, 0))
@@ -379,7 +438,7 @@ retry:
       allocator_data = (struct omp_allocator_data *) allocator;
       if (new_alignment < allocator_data->alignment)
 	new_alignment = allocator_data->alignment;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = allocator_data->memkind;
 #endif
     }
@@ -388,8 +447,10 @@ retry:
       allocator_data = NULL;
       if (new_alignment < sizeof (void *))
 	new_alignment = sizeof (void *);
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
       if (allocator == omp_high_bw_mem_alloc)
 	memkind = GOMP_MEMKIND_HBW_PREFERRED;
       else if (allocator == omp_large_cap_mem_alloc)
@@ -444,6 +505,13 @@ retry:
       allocator_data->used_pool_size = used_pool_size;
       gomp_mutex_unlock (&allocator_data->lock);
 #endif
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -469,6 +537,13 @@ retry:
     }
   else
     {
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -502,7 +577,7 @@ fail:
 	{
 	case omp_atv_default_mem_fb:
 	  if ((new_alignment > sizeof (void *) && new_alignment > alignment)
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 	      || memkind
 #endif
 	      || (allocator_data
@@ -577,6 +652,16 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
 	  gomp_mutex_unlock (&allocator_data->lock);
 #endif
 	}
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (allocator_data->memkind == GOMP_MEMKIND_LIBNUMA)
+	{
+	  libnuma_data->numa_free (data->ptr, data->size);
+	  return;
+	}
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (allocator_data->memkind)
 	{
@@ -590,7 +675,7 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
 #ifdef LIBGOMP_USE_MEMKIND
   else
     {
-      enum gomp_memkind_kind memkind = GOMP_MEMKIND_NONE;
+      enum gomp_numa_memkind_kind memkind = GOMP_MEMKIND_NONE;
       if (data->allocator == omp_high_bw_mem_alloc)
 	memkind = GOMP_MEMKIND_HBW_PREFERRED;
       else if (data->allocator == omp_large_cap_mem_alloc)
@@ -625,8 +710,8 @@ omp_aligned_calloc (size_t alignment, size_t nmemb, size_t size,
   struct omp_allocator_data *allocator_data;
   size_t new_size, size_temp, new_alignment;
   void *ptr, *ret;
-#ifdef LIBGOMP_USE_MEMKIND
-  enum gomp_memkind_kind memkind;
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
+  enum gomp_numa_memkind_kind memkind;
 #endif
 
   if (__builtin_expect (size == 0 || nmemb == 0, 0))
@@ -647,7 +732,7 @@ retry:
       allocator_data = (struct omp_allocator_data *) allocator;
       if (new_alignment < allocator_data->alignment)
 	new_alignment = allocator_data->alignment;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = allocator_data->memkind;
 #endif
     }
@@ -656,8 +741,10 @@ retry:
       allocator_data = NULL;
       if (new_alignment < sizeof (void *))
 	new_alignment = sizeof (void *);
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
       if (allocator == omp_high_bw_mem_alloc)
 	memkind = GOMP_MEMKIND_HBW_PREFERRED;
       else if (allocator == omp_large_cap_mem_alloc)
@@ -714,6 +801,15 @@ retry:
       allocator_data->used_pool_size = used_pool_size;
       gomp_mutex_unlock (&allocator_data->lock);
 #endif
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	/* numa_alloc_local uses mmap with MAP_ANONYMOUS, returning
+	   memory that is initialized to zero.  */
+	ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -739,6 +835,15 @@ retry:
     }
   else
     {
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	/* numa_alloc_local uses mmap with MAP_ANONYMOUS, returning
+	   memory that is initialized to zero.  */
+	ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -772,7 +877,7 @@ fail:
 	{
 	case omp_atv_default_mem_fb:
 	  if ((new_alignment > sizeof (void *) && new_alignment > alignment)
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 	      || memkind
 #endif
 	      || (allocator_data
@@ -815,8 +920,8 @@ omp_realloc (void *ptr, size_t size, omp_allocator_handle_t allocator,
   size_t new_size, old_size, new_alignment, old_alignment;
   void *new_ptr, *ret;
   struct omp_mem_header *data;
-#ifdef LIBGOMP_USE_MEMKIND
-  enum gomp_memkind_kind memkind, free_memkind;
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
+  enum gomp_numa_memkind_kind memkind, free_memkind;
 #endif
 
   if (__builtin_expect (ptr == NULL, 0))
@@ -841,15 +946,17 @@ retry:
       allocator_data = (struct omp_allocator_data *) allocator;
       if (new_alignment < allocator_data->alignment)
 	new_alignment = allocator_data->alignment;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = allocator_data->memkind;
 #endif
     }
   else
     {
       allocator_data = NULL;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
       if (allocator == omp_high_bw_mem_alloc)
 	memkind = GOMP_MEMKIND_HBW_PREFERRED;
       else if (allocator == omp_large_cap_mem_alloc)
@@ -865,15 +972,17 @@ retry:
   if (free_allocator > omp_max_predefined_alloc)
     {
       free_allocator_data = (struct omp_allocator_data *) free_allocator;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       free_memkind = free_allocator_data->memkind;
 #endif
     }
   else
     {
       free_allocator_data = NULL;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
       free_memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
       if (free_allocator == omp_high_bw_mem_alloc)
 	free_memkind = GOMP_MEMKIND_HBW_PREFERRED;
       else if (free_allocator == omp_large_cap_mem_alloc)
@@ -953,6 +1062,19 @@ retry:
       allocator_data->used_pool_size = used_pool_size;
       gomp_mutex_unlock (&allocator_data->lock);
 #endif
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	{
+	  if (prev_size)
+	    new_ptr = libnuma_data->numa_realloc (data->ptr, data->size,
+						  new_size);
+	  else
+	    new_ptr = libnuma_data->numa_alloc_local (new_size);
+	}
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -994,12 +1116,19 @@ retry:
     }
   else if (new_alignment == sizeof (void *)
 	   && old_alignment == sizeof (struct omp_mem_header)
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 	   && memkind == free_memkind
 #endif
 	   && (free_allocator_data == NULL
 	       || free_allocator_data->pool_size == ~(uintptr_t) 0))
     {
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	new_ptr = libnuma_data->numa_realloc (data->ptr, data->size, new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -1021,6 +1150,13 @@ retry:
     }
   else
     {
+#ifdef LIBGOMP_USE_LIBNUMA
+      if (memkind == GOMP_MEMKIND_LIBNUMA)
+	new_ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+      else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
       if (memkind)
 	{
@@ -1060,6 +1196,16 @@ retry:
       gomp_mutex_unlock (&free_allocator_data->lock);
 #endif
     }
+#ifdef LIBGOMP_USE_LIBNUMA
+  if (free_memkind == GOMP_MEMKIND_LIBNUMA)
+    {
+      libnuma_data->numa_free (data->ptr, data->size);
+      return ret;
+    }
+# ifdef LIBGOMP_USE_MEMKIND
+  else
+# endif
+#endif
 #ifdef LIBGOMP_USE_MEMKIND
   if (free_memkind)
     {
@@ -1079,7 +1225,7 @@ fail:
 	{
 	case omp_atv_default_mem_fb:
 	  if (new_alignment > sizeof (void *)
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
 	      || memkind
 #endif
 	      || (allocator_data
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
index 15babcd1ada..64b1b4b9623 100644
--- a/libgomp/config/linux/allocator.c
+++ b/libgomp/config/linux/allocator.c
@@ -31,6 +31,7 @@
 #include "libgomp.h"
 #if defined(PLUGIN_SUPPORT) && defined(LIBGOMP_USE_PTHREADS)
 #define LIBGOMP_USE_MEMKIND
+#define LIBGOMP_USE_LIBNUMA
 #endif
 
 #include "../../allocator.c"
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index d1a5e67329a..9d910e6883c 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -2061,7 +2061,7 @@ Special values are output using @code{%} followed by an optional size
 specification and then either the single-character field type or its long
 name enclosed in curly braces; using @code{%%} will display a literal percent.
 The size specification consists of an optional @code{0.} or @code{.} followed
-by a positive integer, specifing the minimal width of the output.  With
+by a positive integer, specifying the minimal width of the output.  With
 @code{0.} and numerical values, the output is padded with zeros on the left;
 with @code{.}, the output is padded by spaces on the left; otherwise, the
 output is padded by spaces on the right.  If unset, the value is
@@ -2079,8 +2079,8 @@ Supported field types are:
       @tab value returned by
            @code{omp_get_ancestor_thread_num(omp_get_level()-1)}
 @item H @tab host @tab name of the host that executes the thread
-@item P @tab process_id @tab process identiﬁer
-@item i @tab native_thread_id @tab native thread identiﬁer
+@item P @tab process_id @tab process identifier
+@item i @tab native_thread_id @tab native thread identifier
 @item A @tab thread_affinity
       @tab comma separated list of integer values or ranges, representing the
            processors on which a process might execute, subject to affinity
@@ -4584,7 +4584,7 @@ offloading devices (it's not clear if they should be):
 @menu
 * Implementation-defined ICV Initialization::
 * OpenMP Context Selectors::
-* Memory allocation with libmemkind::
+* Memory allocation::
 @end menu
 
 @node Implementation-defined ICV Initialization
@@ -4631,8 +4631,8 @@ smaller number.  On non-host devices, the value of the
       @tab See @code{-march=} in ``Nvidia PTX Options''
 @end multitable
 
-@node Memory allocation with libmemkind
-@section Memory allocation with libmemkind
+@node Memory allocation
+@section Memory allocation
 
 For the memory spaces, the following applies:
 @itemize
@@ -4652,20 +4652,40 @@ creating memory allocators requesting
 @itemize
 @item the memory space @code{omp_high_bw_mem_space}
 @item the memory space @code{omp_large_cap_mem_space}
-@item the partition trait @code{omp_atv_interleaved}; note that for
+@item the @code{partition} trait @code{interleaved}; note that for
       @code{omp_large_cap_mem_space} the allocation will not be interleaved
 @end itemize
 
+On Linux systems, where the @uref{https://github.com/numactl/numactl, numa
+library} (@code{libnuma.so.1}) is available at runtime, it is used when creating
+memory allocators requesting
+
+@itemize
+@item the @code{partition} trait @code{nearest}, except when both the
+libmemkind library is available and the memory space is either
+@code{omp_large_cap_mem_space} or @code{omp_high_bw_mem_space}
+@end itemize
+
+Note that the numa library will round up the allocation size to a multiple of
+the system page size; therefore, consider using it only with large data or
+by sharing allocations via the @code{pool_size} trait.  Furthermore, the Linux
+kernel does not guarantee that an allocation will always be on the nearest NUMA
+node nor that after reallocation the same node will be used.  Note additionally
+that, on Linux, the default setting of the memory placement policy is to use the
+current node; therefore, unless the memory placement policy has been overridden,
+the @code{partition} trait @code{environment} (the default) will be effectively
+a @code{nearest} allocation.
+
 Additional notes:
 @itemize
 @item The @code{pinned} trait is unsupported.
 @item For the @code{partition} trait, the partition part size will be the same
       as the requested size (i.e. @code{interleaved} or @code{blocked} has no
       effect), except for @code{interleaved} when the memkind library is
-      available.  Furthermore, for @code{nearest} the memory might not be
-      on the same NUMA node as thread that allocated the memory; on Linux,
-      this is in particular the case when the memory placement policy is
-      set to preferred.
+      available.  Furthermore, for @code{nearest} and unless the numa library
+      is available, the memory might not be on the same NUMA node as the thread
+      that allocated the memory; on Linux, this is in particular the case when
+      the memory placement policy is set to preferred.
 @item The @code{access} trait has no effect such that memory is always
       accessible by all threads.
 @item The @code{sync_hint} trait has no effect.
diff --git a/libgomp/testsuite/libgomp.c-c++-common/alloc-11.c b/libgomp/testsuite/libgomp.c-c++-common/alloc-11.c
new file mode 100644
index 00000000000..5fbadf4406a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/alloc-11.c
@@ -0,0 +1,285 @@
+/* This testcase is mostly the same as alloc-9.c.
+   However, on systems where the numa and/or memkind libraries are
+   installed, libgomp uses those.  This test ensures that the minimal
+   features work. Note: No attempt has been made to verify the partition
+   hints interleaved and nearest as the kernel purposely ignores them once
+   in a while and it would also require a 'dlopen' dance.
+
+   memkind is used for omp_high_bw_mem_space, omp_large_cap_mem_space
+   and partition = interleaved, albeit it won't be interleaved for
+   omp_large_cap_mem_space.
+
+   numa is used for partition = nearest, unless memkind is used.  */
+
+#include <omp.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+const omp_alloctrait_t traits2[]
+= { { omp_atk_alignment, 16 },
+    { omp_atk_sync_hint, omp_atv_default },
+    { omp_atk_access, omp_atv_default },
+    { omp_atk_pool_size, 1024 },
+    { omp_atk_fallback, omp_atv_default_mem_fb },
+    { omp_atk_partition, omp_atv_nearest } };
+omp_alloctrait_t traits3[]
+= { { omp_atk_sync_hint, omp_atv_uncontended },
+    { omp_atk_alignment, 32 },
+    { omp_atk_access, omp_atv_all },
+    { omp_atk_pool_size, 512 },
+    { omp_atk_fallback, omp_atv_allocator_fb },
+    { omp_atk_fb_data, 0 },
+    { omp_atk_partition, omp_atv_interleaved } };
+const omp_alloctrait_t traits4[]
+= { { omp_atk_alignment, 128 },
+    { omp_atk_pool_size, 1024 },
+    { omp_atk_fallback, omp_atv_null_fb } };
+
+int
+main ()
+{
+  int *volatile p = (int *) omp_alloc (3 * sizeof (int), omp_default_mem_alloc);
+  int *volatile q;
+  int *volatile r;
+  omp_alloctrait_t traits[4]
+    = { { omp_atk_alignment, 64 },
+	{ omp_atk_fallback, omp_atv_null_fb },
+	{ omp_atk_pool_size, 4096 },
+        { omp_atk_partition, omp_atv_nearest } };
+  omp_alloctrait_t traits5[2]
+    = { { omp_atk_fallback, omp_atv_null_fb },
+	{ omp_atk_pool_size, 4096 } };
+  omp_allocator_handle_t a, a2;
+
+  if ((((uintptr_t) p) % __alignof (int)) != 0)
+    abort ();
+  p[0] = 1;
+  p[1] = 2;
+  p[2] = 3;
+  p = (int *) omp_realloc (p, 4 * sizeof (int), omp_high_bw_mem_alloc, omp_high_bw_mem_alloc);
+  if ((((uintptr_t) p) % __alignof (int)) != 0 || p[0] != 1 || p[1] != 2 || p[2] != 3)
+    abort ();
+  p[0] = 4;
+  p[1] = 5;
+  p[2] = 6;
+  p[3] = 7;
+  p = (int *) omp_realloc (p, 2 * sizeof (int), omp_high_bw_mem_alloc, omp_high_bw_mem_alloc);
+  if ((((uintptr_t) p) % __alignof (int)) != 0 || p[0] != 4 || p[1] != 5)
+    abort ();
+  p[0] = 8;
+  p[1] = 9;
+  if (omp_realloc (p, 0, omp_null_allocator, omp_high_bw_mem_alloc) != NULL)
+    abort ();
+  p = (int *) omp_realloc (NULL, 2 * sizeof (int), omp_large_cap_mem_alloc, omp_null_allocator);
+  if ((((uintptr_t) p) % __alignof (int)) != 0)
+    abort ();
+  p[0] = 1;
+  p[1] = 2;
+  p = (int *) omp_realloc (p, 5 * sizeof (int), omp_large_cap_mem_alloc, omp_large_cap_mem_alloc);
+  if ((((uintptr_t) p) % __alignof (int)) != 0 || p[0] != 1 || p[1] != 2)
+    abort ();
+  p[0] = 3;
+  p[1] = 4;
+  p[2] = 5;
+  p[3] = 6;
+  p[4] = 7;
+  omp_free (p, omp_null_allocator);
+  omp_set_default_allocator (omp_large_cap_mem_alloc);
+  if (omp_realloc (NULL, 0, omp_null_allocator, omp_null_allocator) != NULL)
+    abort ();
+  p = (int *) omp_alloc (sizeof (int), omp_null_allocator);
+  if ((((uintptr_t) p) % __alignof (int)) != 0)
+    abort ();
+  p[0] = 3;
+  p = (int *) omp_realloc (p, 3 * sizeof (int), omp_null_allocator, omp_null_allocator);
+  if ((((uintptr_t) p) % __alignof (int)) != 0 || p[0] != 3)
+    abort ();
+  p[0] = 4;
+  p[1] = 5;
+  p[2] = 6;
+  if (omp_realloc (p, 0, omp_null_allocator, omp_get_default_allocator ()) != NULL)
+    abort ();
+  a = omp_init_allocator (omp_default_mem_space, 4, traits);
+  if (a == omp_null_allocator)
+    abort ();
+  p = (int *) omp_alloc (sizeof (int), a);
+  if ((((uintptr_t) p) % 64) != 0)
+    abort ();
+  p[0] = 7;
+  p = (int *) omp_realloc (p, 3072, a, a);
+  if ((((uintptr_t) p) % 64) != 0 || p[0] != 7)
+    abort ();
+  p[0] = 1;
+  p[3071 / sizeof (int)] = 2;
+  q = (int *) omp_alloc (sizeof (int), a);
+  if ((((uintptr_t) q) % 64) != 0)
+    abort ();
+  q[0] = 8;
+  if (omp_realloc (q, 3072, a, a) != NULL)
+    abort ();
+  omp_free (p, a);
+  omp_free (q, a);
+  p = (int *) omp_alloc (sizeof (int), a);
+  p[0] = 42;
+  p = (int *) omp_realloc (p, 3072, a, a);
+  if (p[0] != 42)
+    abort ();
+  p[0] = 3;
+  p[3071 / sizeof (int)] = 4;
+  omp_realloc (p, 0, omp_null_allocator, omp_null_allocator);
+  omp_set_default_allocator (a);
+  if (omp_get_default_allocator () != a)
+    abort ();
+  p = (int *) omp_alloc (31, omp_null_allocator);
+  if (p == NULL)
+    abort ();
+  p = (int *) omp_realloc (p, 3072, omp_null_allocator, omp_null_allocator);
+  if (p == NULL)
+    abort ();
+  q = (int *) omp_alloc (sizeof (int), omp_null_allocator);
+  if (q == NULL)
+    abort ();
+  if (omp_realloc (q, 3072, omp_null_allocator, omp_null_allocator) != NULL)
+    abort ();
+  omp_free (p, a);
+  omp_free (q, a);
+  omp_destroy_allocator (a);
+
+  a = omp_init_allocator (omp_large_cap_mem_space, 2, traits5);
+  if (a == omp_null_allocator)
+    abort ();
+  omp_set_default_allocator (a);
+  if (omp_get_default_allocator () != a)
+    abort ();
+  p = (int *) omp_alloc (3071, omp_null_allocator);
+  if (p == NULL)
+    abort ();
+  p = (int *) omp_realloc (p, 3072, omp_null_allocator, omp_null_allocator);
+  if (p == NULL)
+    abort ();
+  q = (int *) omp_alloc (sizeof (int), omp_null_allocator);
+  if (q == NULL)
+    abort ();
+  if (omp_realloc (q, 3072, omp_null_allocator, omp_null_allocator) != NULL)
+    abort ();
+  omp_free (p, a);
+  omp_free (q, a);
+  omp_destroy_allocator (a);
+
+  a = omp_init_allocator (omp_default_mem_space,
+			  sizeof (traits2) / sizeof (traits2[0]),
+			  traits2);
+  if (a == omp_null_allocator)
+    abort ();
+  if (traits3[5].key != omp_atk_fb_data)
+    abort ();
+  traits3[5].value = (uintptr_t) a;
+  a2 = omp_init_allocator (omp_default_mem_space,
+			   sizeof (traits3) / sizeof (traits3[0]),
+			   traits3);
+  if (a2 == omp_null_allocator)
+    abort ();
+  p = (int *) omp_alloc (sizeof (int), a2);
+  if ((((uintptr_t) p) % 32) != 0)
+    abort ();
+  p[0] = 84;
+  p = (int *) omp_realloc (p, 380, a2, a2);
+  if ((((uintptr_t) p) % 32) != 0 || p[0] != 84)
+    abort ();
+  p[0] = 5;
+  p[379 / sizeof (int)] = 6;
+  q = (int *) omp_alloc (sizeof (int), a2);
+  if ((((uintptr_t) q) % 32) != 0)
+    abort ();
+  q[0] = 42;
+  q = (int *) omp_realloc (q, 768, a2, a2);
+  if ((((uintptr_t) q) % 16) != 0 || q[0] != 42)
+    abort ();
+  q[0] = 7;
+  q[767 / sizeof (int)] = 8;
+  r = (int *) omp_realloc (NULL, 512, a2, omp_null_allocator);
+  if ((((uintptr_t) r) % __alignof (int)) != 0)
+    abort ();
+  r[0] = 9;
+  r[511 / sizeof (int)] = 10;
+  omp_free (p, omp_null_allocator);
+  omp_free (q, a2);
+  omp_free (r, omp_null_allocator);
+  p = (int *) omp_alloc (sizeof (int), a2);
+  if ((((uintptr_t) p) % 32) != 0)
+    abort ();
+  p[0] = 85;
+  p = (int *) omp_realloc (p, 320, a, a2);
+  if ((((uintptr_t) p) % 16) != 0 || p[0] != 85)
+    abort ();
+  p[0] = 5;
+  p[319 / sizeof (int)] = 6;
+  q = (int *) omp_alloc (sizeof (int), a);
+  if ((((uintptr_t) q) % 16) != 0)
+    abort ();
+  q[0] = 43;
+  q = (int *) omp_realloc (q, 320, a2, a);
+  if ((((uintptr_t) q) % 32) != 0 || q[0] != 43)
+    abort ();
+  q[0] = 44;
+  q[319 / sizeof (int)] = 8;
+  q = (int *) omp_realloc (q, 568, a2, a2);
+  if ((((uintptr_t) q) % 16) != 0 || q[0] != 44)
+    abort ();
+  q[0] = 7;
+  q[567 / sizeof (int)] = 8;
+  omp_free (p, omp_null_allocator);
+  omp_free (q, a2);
+  omp_destroy_allocator (a2);
+  omp_destroy_allocator (a);
+
+  a = omp_init_allocator (omp_large_cap_mem_space,
+			  sizeof (traits4) / sizeof (traits4[0]),
+			  traits4);
+  if (a == omp_null_allocator)
+    abort ();
+  if (traits3[5].key != omp_atk_fb_data)
+    abort ();
+  traits3[5].value = (uintptr_t) a;
+  a2 = omp_init_allocator (omp_default_mem_space,
+			   sizeof (traits3) / sizeof (traits3[0]),
+			   traits3);
+  if (a2 == omp_null_allocator)
+    abort ();
+  omp_set_default_allocator (a2);
+#ifdef __cplusplus
+  p = static_cast <int *> (omp_realloc (NULL, 420));
+#else
+  p = (int *) omp_realloc (NULL, 420, omp_null_allocator, omp_null_allocator);
+#endif
+  if ((((uintptr_t) p) % 32) != 0)
+    abort ();
+  p[0] = 5;
+  p[419 / sizeof (int)] = 6;
+  q = (int *) omp_realloc (NULL, sizeof (int), omp_null_allocator, omp_null_allocator);
+  if ((((uintptr_t) q) % 32) != 0)
+    abort ();
+  q[0] = 99;
+  q = (int *) omp_realloc (q, 700, omp_null_allocator, omp_null_allocator);
+  if ((((uintptr_t) q) % 128) != 0 || q[0] != 99)
+    abort ();
+  q[0] = 7;
+  q[699 / sizeof (int)] = 8;
+  if (omp_realloc (NULL, 768, omp_null_allocator, omp_null_allocator) != NULL)
+    abort ();
+#ifdef __cplusplus
+  omp_free (p);
+  if (omp_realloc (q, 0) != NULL)
+    abort ();
+  omp_free (NULL);
+#else
+  omp_free (p, omp_null_allocator);
+  if (omp_realloc (q, 0, omp_null_allocator, omp_null_allocator) != NULL)
+    abort ();
+  omp_free (NULL, omp_null_allocator);
+#endif
+  omp_free (NULL, omp_null_allocator);
+  omp_destroy_allocator (a2);
+  omp_destroy_allocator (a);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/alloc-12.c b/libgomp/testsuite/libgomp.c-c++-common/alloc-12.c
new file mode 100644
index 00000000000..e07de3be6a7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/alloc-12.c
@@ -0,0 +1,217 @@
+/* This testcase is mostly the same as alloc-8.c.
+   However, on systems where the numa and/or memkind libraries are
+   installed, libgomp uses those.  This test ensures that the minimal
+   features work. Note: No attempt has been made to verify the partition
+   hints interleaved and nearest as the kernel purposely ignores them once
+   in a while and it would also require a 'dlopen' dance.
+
+   memkind is used for omp_high_bw_mem_space, omp_large_cap_mem_space
+   and partition = interleaved, albeit it won't be interleaved for
+   omp_large_cap_mem_space.
+
+   numa is used for partition = nearest, unless memkind is used.  */
+
+#include <omp.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+const omp_alloctrait_t traits2[]
+= { { omp_atk_alignment, 16 },
+    { omp_atk_sync_hint, omp_atv_default },
+    { omp_atk_access, omp_atv_default },
+    { omp_atk_pool_size, 1024 },
+    { omp_atk_fallback, omp_atv_default_mem_fb },
+    { omp_atk_partition, omp_atv_nearest } };
+omp_alloctrait_t traits3[]
+= { { omp_atk_sync_hint, omp_atv_uncontended },
+    { omp_atk_alignment, 32 },
+    { omp_atk_access, omp_atv_all },
+    { omp_atk_pool_size, 512 },
+    { omp_atk_fallback, omp_atv_allocator_fb },
+    { omp_atk_fb_data, 0 },
+    { omp_atk_partition, omp_atv_interleaved } };
+const omp_alloctrait_t traits4[]
+= { { omp_atk_alignment, 128 },
+    { omp_atk_pool_size, 1024 },
+    { omp_atk_fallback, omp_atv_null_fb } };
+
+static void
+check_all_zero (void *ptr, size_t len)
+{
+  char *p = (char *) ptr;
+  for (size_t i = 0; i < len; i++)
+    if (p[i] != '\0')
+      abort ();
+}
+
+int
+main ()
+{
+  int *volatile p = (int *) omp_aligned_calloc (sizeof (int), 3, sizeof (int), omp_high_bw_mem_alloc);
+  check_all_zero (p, 3*sizeof (int));
+  int *volatile q;
+  int *volatile r;
+  int i;
+  omp_alloctrait_t traits[3]
+    = { { omp_atk_alignment, 64 },
+	{ omp_atk_fallback, omp_atv_null_fb },
+	{ omp_atk_pool_size, 4096 } };
+  omp_allocator_handle_t a, a2;
+
+  if ((((uintptr_t) p) % __alignof (int)) != 0 || p[0] || p[1] || p[2])
+    abort ();
+  p[0] = 1;
+  p[1] = 2;
+  p[2] = 3;
+  omp_free (p, omp_high_bw_mem_alloc);
+  p = (int *) omp_aligned_calloc (2 * sizeof (int), 1, 2 * sizeof (int), omp_large_cap_mem_alloc);
+  check_all_zero (p, 2*sizeof (int));
+  if ((((uintptr_t) p) % (2 * sizeof (int))) != 0 || p[0] || p[1])
+    abort ();
+  p[0] = 1;
+  p[1] = 2;
+  omp_free (p, omp_null_allocator);
+  omp_set_default_allocator (omp_large_cap_mem_alloc);
+  p = (int *) omp_aligned_calloc (1, 1, sizeof (int), omp_null_allocator);
+  check_all_zero (p, sizeof (int));
+  if ((((uintptr_t) p) % __alignof (int)) != 0 || p[0])
+    abort ();
+  p[0] = 3;
+  omp_free (p, omp_get_default_allocator ());
+
+  a = omp_init_allocator (omp_large_cap_mem_space, 3, traits);
+  if (a == omp_null_allocator)
+    abort ();
+  p = (int *) omp_aligned_calloc (32, 3, 1024, a);
+  check_all_zero (p, 3*1024);
+  if ((((uintptr_t) p) % 64) != 0)
+    abort ();
+  for (i = 0; i < 3072 / sizeof (int); i++)
+    if (p[i])
+      abort ();
+  p[0] = 1;
+  p[3071 / sizeof (int)] = 2;
+  if (omp_aligned_calloc (8, 192, 16, a) != NULL)
+    abort ();
+  omp_free (p, a);
+  p = (int *) omp_aligned_calloc (128, 6, 512, a);
+  check_all_zero (p, 6*512);
+  if ((((uintptr_t) p) % 128) != 0)
+    abort ();
+  for (i = 0; i < 3072 / sizeof (int); i++)
+    if (p[i])
+      abort ();
+  p[0] = 3;
+  p[3071 / sizeof (int)] = 4;
+  omp_free (p, omp_null_allocator);
+  omp_set_default_allocator (a);
+  if (omp_get_default_allocator () != a)
+    abort ();
+  p = (int *) omp_aligned_calloc (64, 12, 256, omp_null_allocator);
+  check_all_zero (p, 12*256);
+  for (i = 0; i < 3072 / sizeof (int); i++)
+    if (p[i])
+      abort ();
+  if (omp_aligned_calloc (8, 128, 24, omp_null_allocator) != NULL)
+    abort ();
+  omp_free (p, a);
+  omp_destroy_allocator (a);
+
+  a = omp_init_allocator (omp_default_mem_space,
+			  sizeof (traits2) / sizeof (traits2[0]),
+			  traits2);
+  if (a == omp_null_allocator)
+    abort ();
+  if (traits3[5].key != omp_atk_fb_data)
+    abort ();
+  traits3[5].value = (uintptr_t) a;
+  a2 = omp_init_allocator (omp_default_mem_space,
+			   sizeof (traits3) / sizeof (traits3[0]),
+			   traits3);
+  if (a2 == omp_null_allocator)
+    abort ();
+  p = (int *) omp_aligned_calloc (4, 5, 84, a2);
+  check_all_zero (p, 5*84);
+  for (i = 0; i < 420 / sizeof (int); i++)
+    if (p[i])
+      abort ();
+  if ((((uintptr_t) p) % 32) != 0)
+    abort ();
+  p[0] = 5;
+  p[419 / sizeof (int)] = 6;
+  q = (int *) omp_aligned_calloc (8, 24, 32, a2);
+  check_all_zero (q, 24*32);
+  if ((((uintptr_t) q) % 16) != 0)
+    abort ();
+  for (i = 0; i < 768 / sizeof (int); i++)
+    if (q[i])
+      abort ();
+  q[0] = 7;
+  q[767 / sizeof (int)] = 8;
+  r = (int *) omp_aligned_calloc (8, 64, 8, a2);
+  check_all_zero (r, 64*8);
+  if ((((uintptr_t) r) % 8) != 0)
+    abort ();
+  for (i = 0; i < 512 / sizeof (int); i++)
+    if (r[i])
+      abort ();
+  r[0] = 9;
+  r[511 / sizeof (int)] = 10;
+  omp_free (p, omp_null_allocator);
+  omp_free (q, a2);
+  omp_free (r, omp_null_allocator);
+  omp_destroy_allocator (a2);
+  omp_destroy_allocator (a);
+
+  a = omp_init_allocator (omp_high_bw_mem_space,
+			  sizeof (traits4) / sizeof (traits4[0]),
+			  traits4);
+  if (a == omp_null_allocator)
+    abort ();
+  if (traits3[5].key != omp_atk_fb_data)
+    abort ();
+  traits3[5].value = (uintptr_t) a;
+  a2 = omp_init_allocator (omp_high_bw_mem_space,
+			   sizeof (traits3) / sizeof (traits3[0]),
+			   traits3);
+  if (a2 == omp_null_allocator)
+    abort ();
+  omp_set_default_allocator (a2);
+#ifdef __cplusplus
+  p = static_cast <int *> (omp_aligned_calloc (4, 21, 20));
+#else
+  p = (int *) omp_aligned_calloc (4, 21, 20, omp_null_allocator);
+#endif
+  check_all_zero (p, 21*20);
+  if ((((uintptr_t) p) % 32) != 0)
+    abort ();
+  for (i = 0; i < 420 / sizeof (int); i++)
+    if (p[i])
+      abort ();
+  p[0] = 5;
+  p[419 / sizeof (int)] = 6;
+  q = (int *) omp_aligned_calloc (64, 12, 64, omp_null_allocator);
+  check_all_zero (q, 12*64);
+  if ((((uintptr_t) q) % 128) != 0)
+    abort ();
+  for (i = 0; i < 768 / sizeof (int); i++)
+    if (q[i])
+      abort ();
+  q[0] = 7;
+  q[767 / sizeof (int)] = 8;
+  if (omp_aligned_calloc (8, 24, 32, omp_null_allocator) != NULL)
+    abort ();
+#ifdef __cplusplus
+  omp_free (p);
+  omp_free (q);
+  omp_free (NULL);
+#else
+  omp_free (p, omp_null_allocator);
+  omp_free (q, omp_null_allocator);
+  omp_free (NULL, omp_null_allocator);
+#endif
+  omp_free (NULL, omp_null_allocator);
+  omp_destroy_allocator (a2);
+  omp_destroy_allocator (a);
+  return 0;
+}

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [Patch] libgomp: Use libnuma for OpenMP's partition=nearest allocation trait
  2023-07-12 12:05 ` Tobias Burnus
@ 2023-07-13 16:13   ` Prathamesh Kulkarni
  2023-07-14  7:27     ` Tobias Burnus
  0 siblings, 1 reply; 4+ messages in thread
From: Prathamesh Kulkarni @ 2023-07-13 16:13 UTC (permalink / raw)
  To: Tobias Burnus; +Cc: gcc-patches, Jakub Jelinek

On Wed, 12 Jul 2023 at 17:35, Tobias Burnus <tobias@codesourcery.com> wrote:
>
> Now committed as r14-2462-g450b05ce54d3f0.
Hi Tobias,
The newly added tests in above commit -- alloc-11.c and alloc-12.c
seem to fail during execution
on armv8l-unknown-linux-gnueabihf:

Running libgomp:libgomp.c++/c++.exp ...
FAIL: libgomp.c++/../libgomp.c-c++-common/alloc-11.c execution test
FAIL: libgomp.c++/../libgomp.c-c++-common/alloc-12.c execution test

Running libgomp:libgomp.c/c.exp ...
FAIL: libgomp.c/../libgomp.c-c++-common/alloc-11.c execution test
FAIL: libgomp.c/../libgomp.c-c++-common/alloc-12.c execution test

Could you please investigate ?

Thanks,
Prathamesh
>
> Changes to the patch in previous email:
> * I fixed some issues found on the way,
> * The wording in the .texi has been improved/expanded, and
> * I included two testcases to exercise the two libraries (or
>    the default allocator when it is not available at runtime).
>
> Given that the default allocation already works fine (nearest)
> and the normal "malloc" is more economic in terms of memory
> handling (not multiples of page size or requesting a fixed
> pool size), I was wondering whether this patch is really needed.
>
> But in the end: the default can be changed (cf. below) and giving
> the user the choice makes sense. The manual states what GCC does,
> which should help to make a conscious choice.
>
> * * *
>
> I did experiment with the testcase attached to previous email
> plus using dlopen to obtain the functions from libnuma if available.
>
> It was also using:
> /* { dg-do run { target { dlopen } } } */
> /* { dg-additional-options "-ldl" } */
>
> However, the Linux kernel too often placed the allocated memory
> on the "wrong" node to be usable as a testcase. I did get between
> 0 and 15 misplaced allocations, depending on the run.
>
> Hence, there is no such testcase. Using numactl --preferred=1 I
> could force the normal allocation to (mostly) use node 1 for
> allocations such that the difference between partition = default/environment
> vs. partition = nearest was clearly visible. Hence it does work.
>
> Otherwise, the same applies as I wrote yesterday:
>
> On 11.07.23 12:35, Tobias Burnus wrote:
>
> > While by default 'malloc' allocates memory on the same node as the
> > calling
> > process/thread ('numactl --show' shows 'preferred node: current',
> > Linux kernel memory policy MPOL_DEFAULT), this can be changed.
> > For instance, when running the program as follows, 'malloc' now
> > prefers to allocate on the second node:
> >   numactl --preferred=1 ./myproc
> >
> > Thus, it seems to be sensible to provide a means to ensure the 'nearest'
> > allocation.  The MPOL_LOCAL policy does so, as provided by
> > libnuma's numa_alloc_local. (Which is just a wrapper around the syscalls
> > mmap and mbind.) As with (lib)memkind, there is a run-time dlopen check
> > for (lib)numa - and no numa*.h is required when building GCC.
> -----------------
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [Patch] libgomp: Use libnuma for OpenMP's partition=nearest allocation trait
  2023-07-13 16:13   ` Prathamesh Kulkarni
@ 2023-07-14  7:27     ` Tobias Burnus
  0 siblings, 0 replies; 4+ messages in thread
From: Tobias Burnus @ 2023-07-14  7:27 UTC (permalink / raw)
  To: Prathamesh Kulkarni; +Cc: gcc-patches, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 860 bytes --]

Hi Prathamesh,

On 13.07.23 18:13, Prathamesh Kulkarni wrote:

> The newly added tests in above commit -- alloc-11.c and alloc-12.c
> seem to fail during execution on armv8l-unknown-linux-gnueabihf:

thanks for the report and sorry for the breakage. While being aware that
libnuma is potentially not available, the code actually did not properly
test for it. (That 200+ packages require libnuma on this system did not
help with testing without it, though.)

I have committed the attached obvious patch as r14-2514-g407d68daed00e0,
which hopefully fixes all issues.

Tobias
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

[-- Attachment #2: committed.diff --]
[-- Type: text/x-patch, Size: 1195 bytes --]

commit 407d68daed00e040a7d9545b2a18aa27bf93a106
Author: Tobias Burnus <tobias@codesourcery.com>
Date:   Fri Jul 14 09:14:37 2023 +0200

    libgomp: Fix allocator handling for Linux when libnuma is not available
    
    Follow up to r14-2462-g450b05ce54d3f0.  The case that libnuma was not
    available at runtime was not properly handled; now it falls back to
    the normal malloc.
    
    libgomp/
    
            * allocator.c (omp_init_allocator): Check whether symbol from
            dlopened libnuma is available before using libnuma for
            allocations.
---
 libgomp/allocator.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libgomp/allocator.c b/libgomp/allocator.c
index b3187ab2911..90f2dcb60d6 100644
--- a/libgomp/allocator.c
+++ b/libgomp/allocator.c
@@ -377,8 +377,9 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
 #ifdef LIBGOMP_USE_LIBNUMA
   if (data.memkind == GOMP_MEMKIND_NONE && data.partition == omp_atv_nearest)
     {
-      data.memkind = GOMP_MEMKIND_LIBNUMA;
       libnuma_data = gomp_get_libnuma ();
+      if (libnuma_data->numa_alloc_local != NULL)
+	data.memkind = GOMP_MEMKIND_LIBNUMA;
     }
 #endif
 

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-07-14  7:27 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-11 10:35 [Patch] libgomp: Use libnuma for OpenMP's partition=nearest allocation trait Tobias Burnus
2023-07-12 12:05 ` Tobias Burnus
2023-07-13 16:13   ` Prathamesh Kulkarni
2023-07-14  7:27     ` Tobias Burnus

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).