public inbox for gcc-patches@gcc.gnu.org
* [PATCH] libgomp, openmp: pinned memory
@ 2022-01-04 15:32 Andrew Stubbs
  2022-01-04 15:55 ` Jakub Jelinek
  2023-03-24 15:49 ` [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory Thomas Schwinge
  0 siblings, 2 replies; 28+ messages in thread
From: Andrew Stubbs @ 2022-01-04 15:32 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 984 bytes --]

This patch implements the OpenMP pinned memory trait for Linux hosts. On 
other hosts and on devices the trait becomes a no-op (instead of being 
rejected).

The memory is locked via the mlock syscall, which is both the "correct" 
way to do it on Linux, and a problem because the default ulimit for 
pinned memory is very small (and most users don't have permission to 
increase it (much?)). Therefore the code emits a non-fatal warning 
message if locking fails.

Another approach might be to use cudaHostAlloc to allocate the memory in 
the first place, which bypasses the ulimit somehow, but this would not 
help non-NVidia users.
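
Purely for illustration (not part of this patch), the CUDA-based
alternative might look something like this; cudaHostAlloc and cudaFreeHost
are real CUDA runtime calls, but the wrapper names are invented here:

#include <cuda_runtime.h>
#include <stddef.h>

/* Hypothetical wrapper: allocate page-locked host memory via the CUDA
   runtime instead of malloc + mlock.  Returns NULL on failure.  */
static void *
cuda_pinned_alloc (size_t size)
{
  void *ptr = NULL;
  /* cudaHostAllocPortable makes the pinning visible to all CUDA contexts.  */
  if (cudaHostAlloc (&ptr, size, cudaHostAllocPortable) != cudaSuccess)
    return NULL;
  return ptr;
}

/* Hypothetical wrapper: release memory obtained from cuda_pinned_alloc.  */
static void
cuda_pinned_free (void *ptr)
{
  cudaFreeHost (ptr);
}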

The tests work on Linux and will xfail on other hosts; neither libgomp 
nor the test knows how to allocate or query pinned memory elsewhere.

The patch applies on top of the text of my previously submitted patches, 
but does not actually depend on the functionality of those patches.

OK for stage 1?

I'll commit a backport to OG11 shortly.

Andrew

[-- Attachment #2: 220104-pinned-trait.patch --]
[-- Type: text/plain, Size: 6893 bytes --]

libgomp: pinned memory

Implement the OpenMP pinned memory trait on Linux hosts using the mlock
syscall.

libgomp/ChangeLog:

	* allocator.c (MEMSPACE_PIN): New macro.
	(xmlock): New function.
	(omp_init_allocator): Don't disallow the pinned trait.
	(omp_aligned_alloc): Add pinning via MEMSPACE_PIN.
	(omp_aligned_calloc): Likewise.
	(omp_realloc): Likewise.
	* testsuite/libgomp.c/alloc-pinned-1.c: New test.
	* testsuite/libgomp.c/alloc-pinned-2.c: New test.

diff --git a/libgomp/allocator.c b/libgomp/allocator.c
index b1f5fe0a5e2..671b91e7ff8 100644
--- a/libgomp/allocator.c
+++ b/libgomp/allocator.c
@@ -51,6 +51,25 @@
 #define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
   ((void)MEMSPACE, (void)SIZE, free (ADDR))
 #endif
+#ifndef MEMSPACE_PIN
+/* Only define this on supported host platforms.  */
+#ifdef __linux__
+#define MEMSPACE_PIN(MEMSPACE, ADDR, SIZE) \
+  ((void)MEMSPACE, xmlock (ADDR, SIZE))
+
+#include <sys/mman.h>
+#include <stdio.h>
+void
+xmlock (void *addr, size_t size)
+{
+  if (mlock (addr, size))
+      perror ("libgomp: failed to pin memory (ulimit too low?)");
+}
+#else
+#define MEMSPACE_PIN(MEMSPACE, ADDR, SIZE) \
+  ((void)MEMSPACE, (void)ADDR, (void)SIZE)
+#endif
+#endif
 
 /* Map the predefined allocators to the correct memory space.
    The index to this table is the omp_allocator_handle_t enum value.  */
@@ -212,7 +231,7 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
     data.alignment = sizeof (void *);
 
   /* No support for these so far (for hbw will use memkind).  */
-  if (data.pinned || data.memspace == omp_high_bw_mem_space)
+  if (data.memspace == omp_high_bw_mem_space)
     return omp_null_allocator;
 
   ret = gomp_malloc (sizeof (struct omp_allocator_data));
@@ -326,6 +345,9 @@ retry:
 #endif
 	  goto fail;
 	}
+
+      if (allocator_data->pinned)
+	MEMSPACE_PIN (allocator_data->memspace, ptr, new_size);
     }
   else
     {
@@ -335,6 +357,9 @@ retry:
       ptr = MEMSPACE_ALLOC (memspace, new_size);
       if (ptr == NULL)
 	goto fail;
+
+      if (allocator_data && allocator_data->pinned)
+	MEMSPACE_PIN (allocator_data->memspace, ptr, new_size);
     }
 
   if (new_alignment > sizeof (void *))
@@ -539,6 +564,9 @@ retry:
 #endif
 	  goto fail;
 	}
+
+      if (allocator_data->pinned)
+	MEMSPACE_PIN (allocator_data->memspace, ptr, new_size);
     }
   else
     {
@@ -548,6 +576,9 @@ retry:
       ptr = MEMSPACE_CALLOC (memspace, new_size);
       if (ptr == NULL)
 	goto fail;
+
+      if (allocator_data && allocator_data->pinned)
+	MEMSPACE_PIN (allocator_data->memspace, ptr, new_size);
     }
 
   if (new_alignment > sizeof (void *))
@@ -727,7 +758,11 @@ retry:
 #endif
 	  goto fail;
 	}
-      else if (prev_size)
+
+      if (allocator_data->pinned)
+	MEMSPACE_PIN (allocator_data->memspace, new_ptr, new_size);
+
+      if (prev_size)
 	{
 	  ret = (char *) new_ptr + sizeof (struct omp_mem_header);
 	  ((struct omp_mem_header *) ret)[-1].ptr = new_ptr;
@@ -747,6 +782,10 @@ retry:
       new_ptr = MEMSPACE_REALLOC (memspace, data->ptr, data->size, new_size);
       if (new_ptr == NULL)
 	goto fail;
+
+      if (allocator_data && allocator_data->pinned)
+	MEMSPACE_PIN (allocator_data->memspace, ptr, new_size);
+
       ret = (char *) new_ptr + sizeof (struct omp_mem_header);
       ((struct omp_mem_header *) ret)[-1].ptr = new_ptr;
       ((struct omp_mem_header *) ret)[-1].size = new_size;
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-1.c b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
new file mode 100644
index 00000000000..0a6360cda29
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+
+/* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
+
+/* Test that pinned memory works.  */
+
+#ifdef __linux__
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+
+int
+get_pinned_mem ()
+{
+  int pid = getpid ();
+  char buf[100];
+  sprintf (buf, "/proc/%d/status", pid);
+
+  FILE *proc = fopen (buf, "r");
+  if (!proc)
+    abort ();
+  while (fgets (buf, 100, proc))
+    {
+      int val;
+      if (sscanf (buf, "VmLck: %d", &val))
+	{
+	  fclose (proc);
+	  return val;
+	}
+    }
+  abort ();
+}
+#else
+int
+get_pinned_mem ()
+{
+  return 0;
+}
+#endif
+
+#include <omp.h>
+
+/* Allocate more than a page each time, but stay within the ulimit.  */
+#define SIZE 10*1024
+
+int
+main ()
+{
+  const omp_alloctrait_t traits[] = {
+      { omp_atk_pinned, 1 }
+  };
+  omp_allocator_handle_t allocator = omp_init_allocator (omp_default_mem_space, 1, traits);
+
+  // Sanity check
+  if (get_pinned_mem () != 0)
+    abort ();
+
+  void *p = omp_alloc (SIZE, allocator);
+  if (!p)
+    abort ();
+
+  int amount = get_pinned_mem ();
+  if (amount == 0)
+    abort ();
+
+  p = omp_realloc (p, SIZE*2, allocator, allocator);
+
+  int amount2 = get_pinned_mem ();
+  if (amount2 <= amount)
+    abort ();
+
+  p = omp_calloc (1, SIZE, allocator);
+
+  if (get_pinned_mem () <= amount2)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-2.c b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
new file mode 100644
index 00000000000..8fdb4ff5cfd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
@@ -0,0 +1,87 @@
+/* { dg-do run } */
+
+/* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
+
+/* Test that pinned memory works (pool_size code path).  */
+
+#ifdef __linux__
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+
+int
+get_pinned_mem ()
+{
+  int pid = getpid ();
+  char buf[100];
+  sprintf (buf, "/proc/%d/status", pid);
+
+  FILE *proc = fopen (buf, "r");
+  if (!proc)
+    abort ();
+  while (fgets (buf, 100, proc))
+    {
+      int val;
+      if (sscanf (buf, "VmLck: %d", &val))
+	{
+	  fclose (proc);
+	  return val;
+	}
+    }
+  abort ();
+}
+#else
+int
+get_pinned_mem ()
+{
+  return 0;
+}
+#endif
+
+#include <omp.h>
+
+/* Allocate more than a page each time, but stay within the ulimit.  */
+#define SIZE 10*1024
+
+int
+main ()
+{
+  const omp_alloctrait_t traits[] = {
+      { omp_atk_pinned, 1 },
+      { omp_atk_pool_size, SIZE*8 }
+  };
+  omp_allocator_handle_t allocator = omp_init_allocator (omp_default_mem_space,
+							 2, traits);
+
+  // Sanity check
+  if (get_pinned_mem () != 0)
+    abort ();
+
+  void *p = omp_alloc (SIZE, allocator);
+  if (!p)
+    abort ();
+
+  int amount = get_pinned_mem ();
+  if (amount == 0)
+    abort ();
+
+  p = omp_realloc (p, SIZE*2, allocator, allocator);
+  if (!p)
+    abort ();
+
+  int amount2 = get_pinned_mem ();
+  if (amount2 <= amount)
+    abort ();
+
+  p = omp_calloc (1, SIZE, allocator);
+  if (!p)
+    abort ();
+
+  if (get_pinned_mem () <= amount2)
+    abort ();
+
+  return 0;
+}


* Re: [PATCH] libgomp, openmp: pinned memory
  2022-01-04 15:32 [PATCH] libgomp, openmp: pinned memory Andrew Stubbs
@ 2022-01-04 15:55 ` Jakub Jelinek
  2022-01-04 16:58   ` Andrew Stubbs
  2023-03-24 15:49 ` [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory Thomas Schwinge
  1 sibling, 1 reply; 28+ messages in thread
From: Jakub Jelinek @ 2022-01-04 15:55 UTC (permalink / raw)
  To: Andrew Stubbs; +Cc: gcc-patches

On Tue, Jan 04, 2022 at 03:32:17PM +0000, Andrew Stubbs wrote:
> This patch implements the OpenMP pinned memory trait for Linux hosts. On
> other hosts and on devices the trait becomes a no-op (instead of being
> rejected).
> 
> The memory is locked via the mlock syscall, which is both the "correct" way
> to do it on Linux, and a problem because the default ulimit for pinned
> memory is very small (and most users don't have permission to increase it
> (much?)). Therefore the code emits a non-fatal warning message if locking
> fails.
> 
> Another approach might be to use cudaHostAlloc to allocate the memory in the
> first place, which bypasses the ulimit somehow, but this would not help
> non-NVidia users.
> 
> The tests work on Linux and will xfail on other hosts; neither libgomp nor
> the test knows how to allocate or query pinned memory elsewhere.
> 
> The patch applies on top of the text of my previously submitted patches, but
> does not actually depend on the functionality of those patches.
> 
> OK for stage 1?
> 
> I'll commit a backport to OG11 shortly.
> 
> Andrew

> libgomp: pinned memory
> 
> Implement the OpenMP pinned memory trait on Linux hosts using the mlock
> syscall.
> 
> libgomp/ChangeLog:
> 
> 	* allocator.c (MEMSPACE_PIN): New macro.
> 	(xmlock): New function.
> 	(omp_init_allocator): Don't disallow the pinned trait.
> 	(omp_aligned_alloc): Add pinning via MEMSPACE_PIN.
> 	(omp_aligned_calloc): Likewise.
> 	(omp_realloc): Likewise.
> 	* testsuite/libgomp.c/alloc-pinned-1.c: New test.
> 	* testsuite/libgomp.c/alloc-pinned-2.c: New test.
> 
> diff --git a/libgomp/allocator.c b/libgomp/allocator.c
> index b1f5fe0a5e2..671b91e7ff8 100644
> --- a/libgomp/allocator.c
> +++ b/libgomp/allocator.c
> @@ -51,6 +51,25 @@
>  #define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
>    ((void)MEMSPACE, (void)SIZE, free (ADDR))
>  #endif
> +#ifndef MEMSPACE_PIN
> +/* Only define this on supported host platforms.  */
> +#ifdef __linux__
> +#define MEMSPACE_PIN(MEMSPACE, ADDR, SIZE) \
> +  ((void)MEMSPACE, xmlock (ADDR, SIZE))
> +
> +#include <sys/mman.h>
> +#include <stdio.h>
> +void
> +xmlock (void *addr, size_t size)
> +{
> +  if (mlock (addr, size))
> +      perror ("libgomp: failed to pin memory (ulimit too low?)");
> +}
> +#else
> +#define MEMSPACE_PIN(MEMSPACE, ADDR, SIZE) \
> +  ((void)MEMSPACE, (void)ADDR, (void)SIZE)
> +#endif
> +#endif

The usual libgomp way of doing this wouldn't be to use #ifdef __linux__, but
instead add libgomp/config/linux/allocator.c that includes some headers,
defines some macros and then includes the generic allocator.c.

I think perror is the wrong thing to do; omp_alloc etc. have a well-defined
interface for what to do in such cases - the allocation should just fail (not
be allocated) and, depending on the user's choice, that can be fatal, or
return NULL, or chain to some other allocator with other properties, etc.

Other issues in the patch are that it doesn't munlock on deallocation, and
that to do so on deallocation we need to figure out what to do on page
boundaries.  As documented, mlock can be passed an address and/or address +
size that aren't at page boundaries, and pinning happens even for just
partially touched pages.  But munlock also unpins even the partially
overlapping pages, and we don't know at that point whether some other pinned
allocation appears in those pages.
Some bad options are: only pin pages wholly contained within the allocation
and don't pin partial pages around it; force at least page alignment and
size so that everything can be pinned; somehow ensure that we never allocate
more than one pinned allocation in such partial pages (but can allocate
non-pinned allocations there); or e.g. use some internal data structure to
track how many pinned allocations are on the partial pages (say a hash map
from page start address to a counter of how many pinned allocations are there;
if it goes to 0, munlock even that page, otherwise munlock just the wholly
contained pages); or perhaps use page-size-aligned allocation and size and
just remember in some data structure that the partial pages could be used
for other pinned (small) allocations.
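
(Not from the patch - just to make the hash-map option concrete, a rough
sketch with a fixed-size table standing in for a real hash map, and with
locking and most error handling omitted:)

#include <sys/mman.h>
#include <stddef.h>
#include <stdint.h>
#include <unistd.h>

/* Toy per-page pin counters; a real implementation would use a proper
   hash map plus a mutex.  */
#define PAGE_TABLE_SIZE 1024
static struct { uintptr_t page; unsigned count; } page_pins[PAGE_TABLE_SIZE];

static unsigned *
pin_counter (uintptr_t page)
{
  int free_slot = -1;
  for (int i = 0; i < PAGE_TABLE_SIZE; i++)
    {
      if (page_pins[i].count && page_pins[i].page == page)
	return &page_pins[i].count;
      if (!page_pins[i].count && free_slot < 0)
	free_slot = i;
    }
  if (free_slot < 0)
    return NULL;	/* Table full; a real hash map would grow.  */
  page_pins[free_slot].page = page;
  return &page_pins[free_slot].count;
}

static int
pin_range (void *addr, size_t size)
{
  uintptr_t pagesize = sysconf (_SC_PAGESIZE);
  uintptr_t first = (uintptr_t) addr & ~(pagesize - 1);
  uintptr_t last = ((uintptr_t) addr + size - 1) & ~(pagesize - 1);
  if (mlock (addr, size))
    return -1;
  for (uintptr_t page = first; page <= last; page += pagesize)
    {
      unsigned *count = pin_counter (page);
      if (count)
	++*count;
    }
  return 0;
}

static void
unpin_range (void *addr, size_t size)
{
  uintptr_t pagesize = sysconf (_SC_PAGESIZE);
  uintptr_t first = (uintptr_t) addr & ~(pagesize - 1);
  uintptr_t last = ((uintptr_t) addr + size - 1) & ~(pagesize - 1);
  for (uintptr_t page = first; page <= last; page += pagesize)
    {
      unsigned *count = pin_counter (page);
      /* Only unlock a page once the last pinned allocation using it is gone.  */
      if (count && *count && --*count == 0)
	munlock ((void *) page, pagesize);
    }
}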

	Jakub



* Re: [PATCH] libgomp, openmp: pinned memory
  2022-01-04 15:55 ` Jakub Jelinek
@ 2022-01-04 16:58   ` Andrew Stubbs
  2022-01-04 18:28     ` Jakub Jelinek
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew Stubbs @ 2022-01-04 16:58 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On 04/01/2022 15:55, Jakub Jelinek wrote:
> The usual libgomp way of doing this wouldn't be to use #ifdef __linux__, but
> instead add libgomp/config/linux/allocator.c that includes some headers,
> defines some macros and then includes the generic allocator.c.

OK, good point, I can do that.

> I think perror is the wrong thing to do, omp_alloc etc. has a well defined
> interface what to do in such cases - the allocation should just fail (not be
> allocated) and depending on user's choice that can be fatal, or return NULL,
> or chain to some other allocator with other properties etc.

I did it this way because pinning feels more like an optimization, and 
falling back to "just works" seemed like what users would want to 
happen. The perror was added because it turns out the default ulimit is 
tiny and I wanted to hint at the solution.

I guess you're right that the consistent behaviour would be to silently 
switch to the fallback allocator, but it still feels like users will be 
left in the dark about why it failed.

> Other issues in the patch are that it doesn't munlock on deallocation and
> that because of that deallocation we need to figure out what to do on page
> boundaries.  As documented, mlock can be passed address and/or address +
> size that aren't at page boundaries and pinning happens even just for
> partially touched pages.  But munlock unpins also even the partially
> overlapping pages and we don't know at that point whether some other pinned
> allocations don't appear in those pages.

Right, it doesn't munlock because of these issues. I don't know of any 
way to solve this that wouldn't involve building tables of locked ranges 
(and knowing what the page size is).

I considered using mmap with the lock flag instead, but the failure mode 
looked unhelpful. I guess we could mmap with the regular flags, then 
mlock after. That should bypass the regular heap and ensure each 
allocation has it's own page. I'm not sure what the unintended 
side-effects of that might be.

> Some bad options are only pin pages wholly contained within the allocation
> and don't pin partial pages around it, force at least page alignment and
> size so that everything can be pinned, somehow ensure that we never allocate
> more than one pinned allocation in such partial pages (but can allocate
> there non-pinned allocations), or e.g. use some internal data structure to
> track how many pinned allocations are on the partial pages (say a hash map
> from page start address to a counter how many pinned allocations are there,
> if it goes to 0 munlock even that page, otherwise munlock just the wholly
> contained pages), or perhaps use page size aligned allocation and size and
> just remember in some data structure that the partial pages could be used
> for other pinned (small) allocations.

Bad options indeed. If any part of the memory block is not pinned I 
expect no performance gains whatsoever. And all this other business adds 
complexity and runtime overhead.

For version 1.0 it feels reasonable to omit the unlock step and hope 
that a) pinned data will be long-lived, or b) short-lived pinned data 
will be replaced with more data that -- most likely -- occupies the same 
pages.

Similarly, it seems likely that serious HPC applications will run on 
devices with lots of RAM, and if not, any page swapping will destroy the 
performance gains of using OpenMP.

For now I'll just fix the architectural issues.

Andrew


* Re: [PATCH] libgomp, openmp: pinned memory
  2022-01-04 16:58   ` Andrew Stubbs
@ 2022-01-04 18:28     ` Jakub Jelinek
  2022-01-04 18:47       ` Jakub Jelinek
  0 siblings, 1 reply; 28+ messages in thread
From: Jakub Jelinek @ 2022-01-04 18:28 UTC (permalink / raw)
  To: Andrew Stubbs; +Cc: gcc-patches

On Tue, Jan 04, 2022 at 04:58:19PM +0000, Andrew Stubbs wrote:
> > I think perror is the wrong thing to do, omp_alloc etc. has a well defined
> > interface what to do in such cases - the allocation should just fail (not be
> > allocated) and depending on user's choice that can be fatal, or return NULL,
> > or chain to some other allocator with other properties etc.
> 
> I did it this way because pinning feels more like an optimization, and
> falling back to "just works" seemed like what users would want to happen.
> The perror was added because it turns out the default ulimit is tiny and I
> wanted to hint at the solution.

Something like perror might be acceptable for GOMP_DEBUG mode, but not
normal operation.  So perhaps use gomp_debug there instead?

If it is just an optimization for the user, they should be using the
chaining to the corresponding allocator without the pinning, to make it clear
what they want and also be standard conforming.

> > Other issues in the patch are that it doesn't munlock on deallocation and
> > that because of that deallocation we need to figure out what to do on page
> > boundaries.  As documented, mlock can be passed address and/or address +
> > size that aren't at page boundaries and pinning happens even just for
> > partially touched pages.  But munlock unpins also even the partially
> > overlapping pages and we don't know at that point whether some other pinned
> > allocations don't appear in those pages.
> 
> Right, it doesn't munlock because of these issues. I don't know of any way
> to solve this that wouldn't involve building tables of locked ranges (and
> knowing what the page size is).
> 
> I considered using mmap with the lock flag instead, but the failure mode
> looked unhelpful. I guess we could mmap with the regular flags, then mlock
> after. That should bypass the regular heap and ensure each allocation has
> its own page. I'm not sure what the unintended side-effects of that might
> be.

But the munlock is even more important because of the low ulimit -l: if
munlock isn't done on deallocation, the default limit (64KB, I think) will
be reached even much earlier.  If most users have just a 64KB limit on
pinned memory per process, then that most likely asks for grabbing such memory
in whole pages and doing memory management on that resource.
Wasting that precious memory on the partial pages, which will most
likely get non-pinned allocations, when we have just 16 such pages would be
a big waste.

	Jakub



* Re: [PATCH] libgomp, openmp: pinned memory
  2022-01-04 18:28     ` Jakub Jelinek
@ 2022-01-04 18:47       ` Jakub Jelinek
  2022-01-05 17:07         ` Andrew Stubbs
  0 siblings, 1 reply; 28+ messages in thread
From: Jakub Jelinek @ 2022-01-04 18:47 UTC (permalink / raw)
  To: Andrew Stubbs, gcc-patches

On Tue, Jan 04, 2022 at 07:28:29PM +0100, Jakub Jelinek via Gcc-patches wrote:
> > > Other issues in the patch are that it doesn't munlock on deallocation and
> > > that because of that deallocation we need to figure out what to do on page
> > > boundaries.  As documented, mlock can be passed address and/or address +
> > > size that aren't at page boundaries and pinning happens even just for
> > > partially touched pages.  But munlock unpins also even the partially
> > > overlapping pages and we don't know at that point whether some other pinned
> > > allocations don't appear in those pages.
> > 
> > Right, it doesn't munlock because of these issues. I don't know of any way
> > to solve this that wouldn't involve building tables of locked ranges (and
> > knowing what the page size is).
> > 
> > I considered using mmap with the lock flag instead, but the failure mode
> > looked unhelpful. I guess we could mmap with the regular flags, then mlock
> > after. That should bypass the regular heap and ensure each allocation has
>> its own page. I'm not sure what the unintended side-effects of that might
> > be.
> 
> But the munlock is even more important because of the low ulimit -l, because
> if munlock isn't done on deallocation, the by default I think 64KB limit
> will be reached even much earlier.  If most users have just 64KB limit on
> pinned memory per process, then that most likely asks for grabbing such memory
> in whole pages and doing memory management on that resource.
> Because wasting that precious memory on the partial pages which will most
> likely get non-pinned allocations when we just have 16 such pages is a big
> waste.

E.g. if we start using (dynamically, using dlopen/dlsym etc.) the memkind
library for some of the allocators, for the pinned memory we could use
e.g. the memkind_create_fixed API - on the first pinned allocation, check
what the ulimit -l is and, if it is fairly small, mmap PROT_NONE the whole
pinned size (but don't pin it whole at start, just whatever we need as we
go).
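
(Just a sketch of the reservation part of that idea, ignoring the memkind
API itself; handling of RLIM_INFINITY and the later carving out and pinning
of individual pages is omitted:)

#include <sys/mman.h>
#include <sys/resource.h>
#include <stddef.h>

/* Hypothetical: on the first pinned allocation, reserve (but do not yet
   pin) an address range sized to the RLIMIT_MEMLOCK soft limit.  A pool
   allocator such as memkind_create_fixed could then hand out pinned
   allocations from it, using mprotect + mlock on pages only as they are
   actually needed.  */
static void *
reserve_pinned_pool (size_t *pool_size)
{
  struct rlimit limit;
  if (getrlimit (RLIMIT_MEMLOCK, &limit))
    return NULL;
  *pool_size = limit.rlim_cur;	/* e.g. only 64KB with common defaults.  */
  void *pool = mmap (NULL, *pool_size, PROT_NONE,
		     MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
  return pool == MAP_FAILED ? NULL : pool;
}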

	Jakub



* Re: [PATCH] libgomp, openmp: pinned memory
  2022-01-04 18:47       ` Jakub Jelinek
@ 2022-01-05 17:07         ` Andrew Stubbs
  2022-01-13 13:53           ` Andrew Stubbs
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew Stubbs @ 2022-01-05 17:07 UTC (permalink / raw)
  To: Jakub Jelinek, gcc-patches

On 04/01/2022 18:47, Jakub Jelinek wrote:
> On Tue, Jan 04, 2022 at 07:28:29PM +0100, Jakub Jelinek via Gcc-patches wrote:
>>>> Other issues in the patch are that it doesn't munlock on deallocation and
>>>> that because of that deallocation we need to figure out what to do on page
>>>> boundaries.  As documented, mlock can be passed address and/or address +
>>>> size that aren't at page boundaries and pinning happens even just for
>>>> partially touched pages.  But munlock unpins also even the partially
>>>> overlapping pages and we don't know at that point whether some other pinned
>>>> allocations don't appear in those pages.
>>>
>>> Right, it doesn't munlock because of these issues. I don't know of any way
>>> to solve this that wouldn't involve building tables of locked ranges (and
>>> knowing what the page size is).
>>>
>>> I considered using mmap with the lock flag instead, but the failure mode
>>> looked unhelpful. I guess we could mmap with the regular flags, then mlock
>>> after. That should bypass the regular heap and ensure each allocation has
>>> its own page. I'm not sure what the unintended side-effects of that might
>>> be.
>>
>> But the munlock is even more important because of the low ulimit -l, because
>> if munlock isn't done on deallocation, the by default I think 64KB limit
>> will be reached even much earlier.  If most users have just 64KB limit on
>> pinned memory per process, then that most likely asks for grabbing such memory
>> in whole pages and doing memory management on that resource.
>> Because wasting that precious memory on the partial pages which will most
>> likely get non-pinned allocations when we just have 16 such pages is a big
>> waste.
> 
> E.g. if we start using (dynamically, using dlopen/dlsym etc.) the memkind
> library for some of the allocators, for the pinned memory we could use
> e.g. the memkind_create_fixed API - on the first pinned allocation, check
> what is the ulimit -l and if it is fairly small, mmap PROT_NONE the whole
> pinned size (but don't pin it whole at start, just whatever we need as we
> go).

I don't believe 64KB will be anything like enough for any real HPC 
application. Is it really worth optimizing for this case?

Anyway, I'm working on an implementation using mmap instead of malloc 
for pinned allocations. I figure that will simplify the unpin algorithm 
(because it'll be munmap) and optimize for large allocations such as I 
imagine HPC applications will use. It won't fix the ulimit issue.

Andrew


* Re: [PATCH] libgomp, openmp: pinned memory
  2022-01-05 17:07         ` Andrew Stubbs
@ 2022-01-13 13:53           ` Andrew Stubbs
  2022-06-07 11:05             ` Andrew Stubbs
                               ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Andrew Stubbs @ 2022-01-13 13:53 UTC (permalink / raw)
  To: Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1515 bytes --]

On 05/01/2022 17:07, Andrew Stubbs wrote:
> I don't believe 64KB will be anything like enough for any real HPC 
> application. Is it really worth optimizing for this case?
> 
> Anyway, I'm working on an implementation using mmap instead of malloc 
> for pinned allocations. I figure that will simplify the unpin algorithm 
> (because it'll be munmap) and optimize for large allocations such as I 
> imagine HPC applications will use. It won't fix the ulimit issue.

Here's my new patch.

This version is intended to apply on top of the latest version of my 
low-latency allocator patch, although the dependency is mostly textual.

Pinned memory is allocated via mmap + mlock, and allocation fails 
(returns NULL) if the lock fails and there's no fallback configured.

This means that large allocations will now be page aligned and therefore 
pin the smallest number of pages for the size requested, and that that 
memory will be unpinned automatically when freed via munmap, or moved 
via mremap.

Obviously this is not ideal for allocations much smaller than one page. 
If that turns out to be a problem in the real world then we can add a 
special case fairly straightforwardly, and incur the extra page-tracking 
expense in those cases only, or maybe implement our own pinned-memory heap 
(something like what was already proposed for low-latency memory, perhaps).

Also new is a realloc implementation that works better when reallocation 
fails. This is confirmed by the new testcases.

OK for stage 1?

Thanks

Andrew

[-- Attachment #2: 220113-pinned-trait.patch --]
[-- Type: text/plain, Size: 21345 bytes --]

libgomp: pinned memory

Implement the OpenMP pinned memory trait on Linux hosts using the mlock
syscall.  Pinned allocations are performed using mmap, not malloc, to ensure
that they can be unpinned safely when freed.

libgomp/ChangeLog:

	* allocator.c (MEMSPACE_ALLOC): Add PIN.
	(MEMSPACE_CALLOC): Add PIN.
	(MEMSPACE_REALLOC): Add PIN.
	(MEMSPACE_FREE): Add PIN.
	(xmlock): New function.
	(omp_init_allocator): Don't disallow the pinned trait.
	(omp_aligned_alloc): Add pinning to all MEMSPACE_* calls.
	(omp_aligned_calloc): Likewise.
	(omp_realloc): Likewise.
	(omp_free): Likewise.
	* config/linux/allocator.c: New file.
	* config/nvptx/allocator.c (MEMSPACE_ALLOC): Add PIN.
	(MEMSPACE_CALLOC): Add PIN.
	(MEMSPACE_REALLOC): Add PIN.
	(MEMSPACE_FREE): Add PIN.
	* testsuite/libgomp.c/alloc-pinned-1.c: New test.
	* testsuite/libgomp.c/alloc-pinned-2.c: New test.
	* testsuite/libgomp.c/alloc-pinned-3.c: New test.
	* testsuite/libgomp.c/alloc-pinned-4.c: New test.

diff --git a/libgomp/allocator.c b/libgomp/allocator.c
index 1cc7486fc4c..5ab161b6314 100644
--- a/libgomp/allocator.c
+++ b/libgomp/allocator.c
@@ -36,16 +36,20 @@
 
 /* These macros may be overridden in config/<target>/allocator.c.  */
 #ifndef MEMSPACE_ALLOC
-#define MEMSPACE_ALLOC(MEMSPACE, SIZE) malloc (SIZE)
+#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
+  (PIN ? NULL : malloc (SIZE))
 #endif
 #ifndef MEMSPACE_CALLOC
-#define MEMSPACE_CALLOC(MEMSPACE, SIZE) calloc (1, SIZE)
+#define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \
+  (PIN ? NULL : calloc (1, SIZE))
 #endif
 #ifndef MEMSPACE_REALLOC
-#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) realloc (ADDR, SIZE)
+#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \
+  ((PIN) || (OLDPIN) ? NULL : realloc (ADDR, SIZE))
 #endif
 #ifndef MEMSPACE_FREE
-#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) free (ADDR)
+#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE, PIN) \
+  (PIN ? NULL : free (ADDR))
 #endif
 
 /* Map the predefined allocators to the correct memory space.
@@ -208,7 +212,7 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
     data.alignment = sizeof (void *);
 
   /* No support for these so far (for hbw will use memkind).  */
-  if (data.pinned || data.memspace == omp_high_bw_mem_space)
+  if (data.memspace == omp_high_bw_mem_space)
     return omp_null_allocator;
 
   ret = gomp_malloc (sizeof (struct omp_allocator_data));
@@ -309,7 +313,8 @@ retry:
       allocator_data->used_pool_size = used_pool_size;
       gomp_mutex_unlock (&allocator_data->lock);
 #endif
-      ptr = MEMSPACE_ALLOC (allocator_data->memspace, new_size);
+      ptr = MEMSPACE_ALLOC (allocator_data->memspace, new_size,
+			    allocator_data->pinned);
       if (ptr == NULL)
 	{
 #ifdef HAVE_SYNC_BUILTINS
@@ -329,7 +334,8 @@ retry:
 	= (allocator_data
 	   ? allocator_data->memspace
 	   : predefined_alloc_mapping[allocator]);
-      ptr = MEMSPACE_ALLOC (memspace, new_size);
+      ptr = MEMSPACE_ALLOC (memspace, new_size,
+			    allocator_data && allocator_data->pinned);
       if (ptr == NULL)
 	goto fail;
     }
@@ -356,9 +362,9 @@ fail:
     {
     case omp_atv_default_mem_fb:
       if ((new_alignment > sizeof (void *) && new_alignment > alignment)
-	  || (allocator_data
-	      && allocator_data->pool_size < ~(uintptr_t) 0)
-	  || !allocator_data)
+	  || !allocator_data
+	  || allocator_data->pool_size < ~(uintptr_t) 0
+	  || allocator_data->pinned)
 	{
 	  allocator = omp_default_mem_alloc;
 	  goto retry;
@@ -410,6 +416,7 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
   struct omp_mem_header *data;
   omp_memspace_handle_t memspace __attribute__((unused))
     = omp_default_mem_space;
+  int pinned __attribute__((unused)) = false;
 
   if (ptr == NULL)
     return;
@@ -432,11 +439,12 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
 	}
 
       memspace = allocator_data->memspace;
+      pinned = allocator_data->pinned;
     }
   else
     memspace = predefined_alloc_mapping[data->allocator];
 
-  MEMSPACE_FREE (memspace, data->ptr, data->size);
+  MEMSPACE_FREE (memspace, data->ptr, data->size, pinned);
 }
 
 ialias (omp_free)
@@ -524,7 +532,8 @@ retry:
       allocator_data->used_pool_size = used_pool_size;
       gomp_mutex_unlock (&allocator_data->lock);
 #endif
-      ptr = MEMSPACE_CALLOC (allocator_data->memspace, new_size);
+      ptr = MEMSPACE_CALLOC (allocator_data->memspace, new_size,
+			     allocator_data->pinned);
       if (ptr == NULL)
 	{
 #ifdef HAVE_SYNC_BUILTINS
@@ -544,7 +553,8 @@ retry:
 	= (allocator_data
 	   ? allocator_data->memspace
 	   : predefined_alloc_mapping[allocator]);
-      ptr = MEMSPACE_CALLOC (memspace, new_size);
+      ptr = MEMSPACE_CALLOC (memspace, new_size,
+			     allocator_data && allocator_data->pinned);
       if (ptr == NULL)
 	goto fail;
     }
@@ -571,9 +581,9 @@ fail:
     {
     case omp_atv_default_mem_fb:
       if ((new_alignment > sizeof (void *) && new_alignment > alignment)
-	  || (allocator_data
-	      && allocator_data->pool_size < ~(uintptr_t) 0)
-	  || !allocator_data)
+	  || !allocator_data
+	  || allocator_data->pool_size < ~(uintptr_t) 0
+	  || allocator_data->pinned)
 	{
 	  allocator = omp_default_mem_alloc;
 	  goto retry;
@@ -710,9 +720,13 @@ retry:
 #endif
       if (prev_size)
 	new_ptr = MEMSPACE_REALLOC (allocator_data->memspace, data->ptr,
-				    data->size, new_size);
+				    data->size, new_size,
+				    (free_allocator_data
+				     && free_allocator_data->pinned),
+				    allocator_data->pinned);
       else
-	new_ptr = MEMSPACE_ALLOC (allocator_data->memspace, new_size);
+	new_ptr = MEMSPACE_ALLOC (allocator_data->memspace, new_size,
+				  allocator_data->pinned);
       if (new_ptr == NULL)
 	{
 #ifdef HAVE_SYNC_BUILTINS
@@ -744,9 +758,13 @@ retry:
 	= (allocator_data
 	   ? allocator_data->memspace
 	   : predefined_alloc_mapping[allocator]);
-      new_ptr = MEMSPACE_REALLOC (memspace, data->ptr, data->size, new_size);
+      new_ptr = MEMSPACE_REALLOC (memspace, data->ptr, data->size, new_size,
+				  (free_allocator_data
+				   && free_allocator_data->pinned),
+				  allocator_data && allocator_data->pinned);
       if (new_ptr == NULL)
 	goto fail;
+
       ret = (char *) new_ptr + sizeof (struct omp_mem_header);
       ((struct omp_mem_header *) ret)[-1].ptr = new_ptr;
       ((struct omp_mem_header *) ret)[-1].size = new_size;
@@ -759,7 +777,8 @@ retry:
 	= (allocator_data
 	   ? allocator_data->memspace
 	   : predefined_alloc_mapping[allocator]);
-      new_ptr = MEMSPACE_ALLOC (memspace, new_size);
+      new_ptr = MEMSPACE_ALLOC (memspace, new_size,
+				allocator_data && allocator_data->pinned);
       if (new_ptr == NULL)
 	goto fail;
     }
@@ -802,9 +821,9 @@ fail:
     {
     case omp_atv_default_mem_fb:
       if (new_alignment > sizeof (void *)
-	  || (allocator_data
-	      && allocator_data->pool_size < ~(uintptr_t) 0)
-	  || !allocator_data)
+	  || !allocator_data
+	  || allocator_data->pool_size < ~(uintptr_t) 0
+	  || allocator_data->pinned)
 	{
 	  allocator = omp_default_mem_alloc;
 	  goto retry;
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
new file mode 100644
index 00000000000..5f3ae491f07
--- /dev/null
+++ b/libgomp/config/linux/allocator.c
@@ -0,0 +1,124 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implement malloc routines that can handle pinned memory on Linux.
+   
+   It's possible to use mlock on any heap memory, but using munlock is
+   problematic if there are multiple pinned allocations on the same page.
+   Tracking all that manually would be possible, but adds overhead. This may
+   be worth it if there are a lot of small allocations getting pinned, but
+   this seems less likely in a HPC application.
+
+   Instead we optimize for large pinned allocations, and use mmap to ensure
+   that two pinned allocations don't share the same page.  This also means
+   that large allocations don't pin extra pages by being poorly aligned.  */
+
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <string.h>
+#include "libgomp.h"
+
+static void *
+linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
+{
+  (void)memspace;
+
+  if (pin)
+    {
+      void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      if (addr == MAP_FAILED)
+	return NULL;
+
+      if (mlock (addr, size))
+	{
+	  gomp_debug (0, "libgomp: failed to pin memory (ulimit too low?)\n");
+	  munmap (addr, size);
+	  return NULL;
+	}
+
+      return addr;
+    }
+  else
+    return malloc (size);
+}
+
+static void *
+linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
+{
+  if (pin)
+    return linux_memspace_alloc (memspace, size, pin);
+  else
+    return calloc (1, size);
+}
+
+static void
+linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
+		     int pin)
+{
+  (void)memspace;
+
+  if (pin)
+    munmap (addr, size);
+  else
+    free (addr);
+}
+
+static void *
+linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
+			size_t oldsize, size_t size, int oldpin, int pin)
+{
+  if (oldpin && pin)
+    {
+      void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
+      if (newaddr == MAP_FAILED)
+	return NULL;
+
+      return newaddr;
+    }
+  else if (oldpin || pin)
+    {
+      void *newaddr = linux_memspace_alloc (memspace, size, pin);
+      if (newaddr)
+	{
+	  memcpy (newaddr, addr, oldsize < size ? oldsize : size);
+	  linux_memspace_free (memspace, addr, oldsize, oldpin);
+	}
+
+      return newaddr;
+    }
+  else
+    return realloc (addr, size);
+}
+
+#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
+  linux_memspace_alloc (MEMSPACE, SIZE, PIN)
+#define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \
+  linux_memspace_calloc (MEMSPACE, SIZE, PIN)
+#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \
+  linux_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN)
+#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE, PIN) \
+  linux_memspace_free (MEMSPACE, ADDR, SIZE, PIN)
+
+#include "../../allocator.c"
diff --git a/libgomp/config/nvptx/allocator.c b/libgomp/config/nvptx/allocator.c
index 6bc2ea48043..f740b97f6ac 100644
--- a/libgomp/config/nvptx/allocator.c
+++ b/libgomp/config/nvptx/allocator.c
@@ -358,13 +358,13 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
     return realloc (addr, size);
 }
 
-#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
+#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
   nvptx_memspace_alloc (MEMSPACE, SIZE)
-#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
+#define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \
   nvptx_memspace_calloc (MEMSPACE, SIZE)
-#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \
+#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \
   nvptx_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
-#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
+#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE, PIN) \
   nvptx_memspace_free (MEMSPACE, ADDR, SIZE)
 
 #include "../../allocator.c"
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-1.c b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
new file mode 100644
index 00000000000..0a6360cda29
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+
+/* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
+
+/* Test that pinned memory works.  */
+
+#ifdef __linux__
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+
+int
+get_pinned_mem ()
+{
+  int pid = getpid ();
+  char buf[100];
+  sprintf (buf, "/proc/%d/status", pid);
+
+  FILE *proc = fopen (buf, "r");
+  if (!proc)
+    abort ();
+  while (fgets (buf, 100, proc))
+    {
+      int val;
+      if (sscanf (buf, "VmLck: %d", &val))
+	{
+	  fclose (proc);
+	  return val;
+	}
+    }
+  abort ();
+}
+#else
+int
+get_pinned_mem ()
+{
+  return 0;
+}
+#endif
+
+#include <omp.h>
+
+/* Allocate more than a page each time, but stay within the ulimit.  */
+#define SIZE 10*1024
+
+int
+main ()
+{
+  const omp_alloctrait_t traits[] = {
+      { omp_atk_pinned, 1 }
+  };
+  omp_allocator_handle_t allocator = omp_init_allocator (omp_default_mem_space, 1, traits);
+
+  // Sanity check
+  if (get_pinned_mem () != 0)
+    abort ();
+
+  void *p = omp_alloc (SIZE, allocator);
+  if (!p)
+    abort ();
+
+  int amount = get_pinned_mem ();
+  if (amount == 0)
+    abort ();
+
+  p = omp_realloc (p, SIZE*2, allocator, allocator);
+
+  int amount2 = get_pinned_mem ();
+  if (amount2 <= amount)
+    abort ();
+
+  p = omp_calloc (1, SIZE, allocator);
+
+  if (get_pinned_mem () <= amount2)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-2.c b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
new file mode 100644
index 00000000000..8fdb4ff5cfd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
@@ -0,0 +1,87 @@
+/* { dg-do run } */
+
+/* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
+
+/* Test that pinned memory works (pool_size code path).  */
+
+#ifdef __linux__
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+
+int
+get_pinned_mem ()
+{
+  int pid = getpid ();
+  char buf[100];
+  sprintf (buf, "/proc/%d/status", pid);
+
+  FILE *proc = fopen (buf, "r");
+  if (!proc)
+    abort ();
+  while (fgets (buf, 100, proc))
+    {
+      int val;
+      if (sscanf (buf, "VmLck: %d", &val))
+	{
+	  fclose (proc);
+	  return val;
+	}
+    }
+  abort ();
+}
+#else
+int
+get_pinned_mem ()
+{
+  return 0;
+}
+#endif
+
+#include <omp.h>
+
+/* Allocate more than a page each time, but stay within the ulimit.  */
+#define SIZE 10*1024
+
+int
+main ()
+{
+  const omp_alloctrait_t traits[] = {
+      { omp_atk_pinned, 1 },
+      { omp_atk_pool_size, SIZE*8 }
+  };
+  omp_allocator_handle_t allocator = omp_init_allocator (omp_default_mem_space,
+							 2, traits);
+
+  // Sanity check
+  if (get_pinned_mem () != 0)
+    abort ();
+
+  void *p = omp_alloc (SIZE, allocator);
+  if (!p)
+    abort ();
+
+  int amount = get_pinned_mem ();
+  if (amount == 0)
+    abort ();
+
+  p = omp_realloc (p, SIZE*2, allocator, allocator);
+  if (!p)
+    abort ();
+
+  int amount2 = get_pinned_mem ();
+  if (amount2 <= amount)
+    abort ();
+
+  p = omp_calloc (1, SIZE, allocator);
+  if (!p)
+    abort ();
+
+  if (get_pinned_mem () <= amount2)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-3.c b/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
new file mode 100644
index 00000000000..943dfea5c9b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
@@ -0,0 +1,125 @@
+/* { dg-do run } */
+
+/* Test that pinned memory fails correctly.  */
+
+#ifdef __linux__
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+#include <sys/resource.h>
+
+int
+get_pinned_mem ()
+{
+  int pid = getpid ();
+  char buf[100];
+  sprintf (buf, "/proc/%d/status", pid);
+
+  FILE *proc = fopen (buf, "r");
+  if (!proc)
+    abort ();
+  while (fgets (buf, 100, proc))
+    {
+      int val;
+      if (sscanf (buf, "VmLck: %d", &val))
+	{
+	  fclose (proc);
+	  return val;
+	}
+    }
+  abort ();
+}
+
+void
+set_pin_limit (int size)
+{
+  struct rlimit limit;
+  if (getrlimit (RLIMIT_MEMLOCK, &limit))
+    abort ();
+  limit.rlim_cur = (limit.rlim_max < size ? limit.rlim_max : size);
+  if (setrlimit (RLIMIT_MEMLOCK, &limit))
+    abort ();
+}
+#else
+int
+get_pinned_mem ()
+{
+  return 0;
+}
+
+void
+set_pin_limit ()
+{
+}
+#endif
+
+#include <omp.h>
+
+/* This should be large enough to cover multiple pages.  */
+#define SIZE 10000*1024
+
+int
+main ()
+{
+  /* Pinned memory, no fallback.  */
+  const omp_alloctrait_t traits1[] = {
+      { omp_atk_pinned, 1 },
+      { omp_atk_fallback, omp_atv_null_fb }
+  };
+  omp_allocator_handle_t allocator1 = omp_init_allocator (omp_default_mem_space, 2, traits1);
+
+  /* Pinned memory, plain memory fallback.  */
+  const omp_alloctrait_t traits2[] = {
+      { omp_atk_pinned, 1 },
+      { omp_atk_fallback, omp_atv_default_mem_fb }
+  };
+  omp_allocator_handle_t allocator2 = omp_init_allocator (omp_default_mem_space, 2, traits2);
+
+  /* Ensure that the limit is smaller than the allocation.  */
+  set_pin_limit (SIZE/2);
+
+  // Sanity check
+  if (get_pinned_mem () != 0)
+    abort ();
+
+  // Should fail
+  void *p = omp_alloc (SIZE, allocator1);
+  if (p)
+    abort ();
+
+  // Should fail
+  p = omp_calloc (1, SIZE, allocator1);
+  if (p)
+    abort ();
+
+  // Should fall back
+  p = omp_alloc (SIZE, allocator2);
+  if (!p)
+    abort ();
+
+  // Should fall back
+  p = omp_calloc (1, SIZE, allocator2);
+  if (!p)
+    abort ();
+
+  // Should fail to realloc
+  void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
+  p = omp_realloc (notpinned, SIZE, allocator1, omp_default_mem_alloc);
+  if (!notpinned || p)
+    abort ();
+
+  // Should fall back to no realloc needed
+  p = omp_realloc (notpinned, SIZE, allocator2, omp_default_mem_alloc);
+  if (p != notpinned)
+    abort ();
+
+  // No memory should have been pinned
+  int amount = get_pinned_mem ();
+  if (amount != 0)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-4.c b/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
new file mode 100644
index 00000000000..d9cb8dfe1fd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
@@ -0,0 +1,127 @@
+/* { dg-do run } */
+
+/* Test that pinned memory fails correctly, pool_size code path.  */
+
+#ifdef __linux__
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+#include <sys/resource.h>
+
+int
+get_pinned_mem ()
+{
+  int pid = getpid ();
+  char buf[100];
+  sprintf (buf, "/proc/%d/status", pid);
+
+  FILE *proc = fopen (buf, "r");
+  if (!proc)
+    abort ();
+  while (fgets (buf, 100, proc))
+    {
+      int val;
+      if (sscanf (buf, "VmLck: %d", &val))
+	{
+	  fclose (proc);
+	  return val;
+	}
+    }
+  abort ();
+}
+
+void
+set_pin_limit (int size)
+{
+  struct rlimit limit;
+  if (getrlimit (RLIMIT_MEMLOCK, &limit))
+    abort ();
+  limit.rlim_cur = (limit.rlim_max < size ? limit.rlim_max : size);
+  if (setrlimit (RLIMIT_MEMLOCK, &limit))
+    abort ();
+}
+#else
+int
+get_pinned_mem ()
+{
+  return 0;
+}
+
+void
+set_pin_limit ()
+{
+}
+#endif
+
+#include <omp.h>
+
+/* This should be large enough to cover multiple pages.  */
+#define SIZE 10000*1024
+
+int
+main ()
+{
+  /* Pinned memory, no fallback.  */
+  const omp_alloctrait_t traits1[] = {
+      { omp_atk_pinned, 1 },
+      { omp_atk_fallback, omp_atv_null_fb },
+      { omp_atk_pool_size, SIZE*8 }
+  };
+  omp_allocator_handle_t allocator1 = omp_init_allocator (omp_default_mem_space, 3, traits1);
+
+  /* Pinned memory, plain memory fallback.  */
+  const omp_alloctrait_t traits2[] = {
+      { omp_atk_pinned, 1 },
+      { omp_atk_fallback, omp_atv_default_mem_fb },
+      { omp_atk_pool_size, SIZE*8 }
+  };
+  omp_allocator_handle_t allocator2 = omp_init_allocator (omp_default_mem_space, 3, traits2);
+
+  /* Ensure that the limit is smaller than the allocation.  */
+  set_pin_limit (SIZE/2);
+
+  // Sanity check
+  if (get_pinned_mem () != 0)
+    abort ();
+
+  // Should fail
+  void *p = omp_alloc (SIZE, allocator1);
+  if (p)
+    abort ();
+
+  // Should fail
+  p = omp_calloc (1, SIZE, allocator1);
+  if (p)
+    abort ();
+
+  // Should fall back
+  p = omp_alloc (SIZE, allocator2);
+  if (!p)
+    abort ();
+
+  // Should fall back
+  p = omp_calloc (1, SIZE, allocator2);
+  if (!p)
+    abort ();
+
+  // Should fail to realloc
+  void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
+  p = omp_realloc (notpinned, SIZE, allocator1, omp_default_mem_alloc);
+  if (!notpinned || p)
+    abort ();
+
+  // Should fall back to no realloc needed
+  p = omp_realloc (notpinned, SIZE, allocator2, omp_default_mem_alloc);
+  if (p != notpinned)
+    abort ();
+
+  // No memory should have been pinned
+  int amount = get_pinned_mem ();
+  if (amount != 0)
+    abort ();
+
+  return 0;
+}


* Re: [PATCH] libgomp, openmp: pinned memory
  2022-01-13 13:53           ` Andrew Stubbs
@ 2022-06-07 11:05             ` Andrew Stubbs
  2022-06-07 12:10               ` Jakub Jelinek
  2023-02-10 15:11             ` [PATCH] libgomp, openmp: pinned memory Thomas Schwinge
  2023-02-16 21:39             ` [og12] Clarify/verify OpenMP 'omp_calloc' zero-initialization for pinned memory (was: [PATCH] libgomp, openmp: pinned memory) Thomas Schwinge
  2 siblings, 1 reply; 28+ messages in thread
From: Andrew Stubbs @ 2022-06-07 11:05 UTC (permalink / raw)
  To: Jakub Jelinek, gcc-patches

Following some feedback from users of the OG11 branch I think I need to 
withdraw this patch, for now.

The memory pinned via the mlock call does not give the expected 
performance boost. I had not expected that it would do much in my test 
setup, given that the machine has a lot of RAM and my benchmarks are 
small, but others have tried more and on varying machines and architectures.

It seems that it isn't enough for the memory to be pinned; it has to be 
pinned using the Cuda API to get the performance boost. I had not done 
this because it was difficult to resolve the code abstraction 
difficulties and anyway the implementation was supposed to be device 
independent, but it seems we need a specific pinning mechanism for each 
device.

I will resubmit this patch with some kind of Cuda/plugin hook soonish, 
keeping the existing implementation for other device types. I don't know 
how that'll handle heterogeneous systems, but those ought to be rare.
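
(For reference, the CUDA-side pinning that appears to be needed is
cudaHostRegister on an existing allocation, or cudaHostAlloc up front.  A
rough sketch of what such a plugin hook might do - the wrapper names are
invented and this is not the actual plugin interface:)

#include <cuda_runtime.h>
#include <stddef.h>

/* Hypothetical hook: make an existing host allocation page-locked and
   known to the CUDA driver, so transfers can use the faster DMA path.
   Returns nonzero on failure.  */
static int
nvptx_pin_host_memory (void *addr, size_t size)
{
  /* cudaHostRegisterPortable pins the range for all CUDA contexts.  */
  return cudaHostRegister (addr, size, cudaHostRegisterPortable) != cudaSuccess;
}

/* Hypothetical hook: undo the registration before the memory is freed.  */
static void
nvptx_unpin_host_memory (void *addr)
{
  cudaHostUnregister (addr);
}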

I don't think libmemkind will resolve this performance issue, although 
certainly it can be used for host implementations of low-latency 
memories, etc.

Andrew

On 13/01/2022 13:53, Andrew Stubbs wrote:
> On 05/01/2022 17:07, Andrew Stubbs wrote:
>> I don't believe 64KB will be anything like enough for any real HPC 
>> application. Is it really worth optimizing for this case?
>>
>> Anyway, I'm working on an implementation using mmap instead of malloc 
>> for pinned allocations. I figure that will simplify the unpin 
>> algorithm (because it'll be munmap) and optimize for large allocations 
>> such as I imagine HPC applications will use. It won't fix the ulimit 
>> issue.
> 
> Here's my new patch.
> 
> This version is intended to apply on top of the latest version of my 
> low-latency allocator patch, although the dependency is mostly textual.
> 
> Pinned memory is allocated via mmap + mlock, and allocation fails 
> (returns NULL) if the lock fails and there's no fallback configured.
> 
> This means that large allocations will now be page aligned and therefore 
> pin the smallest number of pages for the size requested, and that that 
> memory will be unpinned automatically when freed via munmap, or moved 
> via mremap.
> 
> Obviously this is not ideal for allocations much smaller than one page. 
> If that turns out to be a problem in the real world then we can add a 
> special case fairly straight-forwardly, and incur the extra page 
> tracking expense in those cases only, or maybe implement our own 
> pinned-memory heap (something like already proposed for low-latency 
> memory, perhaps).
> 
> Also new is a realloc implementation that works better when reallocation 
> fails. This is confirmed by the new testcases.
> 
> OK for stage 1?
> 
> Thanks
> 
> Andrew



* Re: [PATCH] libgomp, openmp: pinned memory
  2022-06-07 11:05             ` Andrew Stubbs
@ 2022-06-07 12:10               ` Jakub Jelinek
  2022-06-07 12:28                 ` Andrew Stubbs
  0 siblings, 1 reply; 28+ messages in thread
From: Jakub Jelinek @ 2022-06-07 12:10 UTC (permalink / raw)
  To: Andrew Stubbs; +Cc: gcc-patches

On Tue, Jun 07, 2022 at 12:05:40PM +0100, Andrew Stubbs wrote:
> Following some feedback from users of the OG11 branch I think I need to
> withdraw this patch, for now.
> 
> The memory pinned via the mlock call does not give the expected performance
> boost. I had not expected that it would do much in my test setup, given that
> the machine has a lot of RAM and my benchmarks are small, but others have
> tried more and on varying machines and architectures.

I don't understand why there should be any expected performance boost (at
least not unless the machine starts swapping out pages);
{ omp_atk_pinned, true } is solely about the requirement that the memory
can't be swapped out.

> It seems that it isn't enough for the memory to be pinned, it has to be
> pinned using the Cuda API to get the performance boost. I had not done this

For performance boost of what kind of code?
I don't understand how Cuda API could be useful (or can be used at all) if
offloading to NVPTX isn't involved.  The fact that somebody asks for host
memory allocation with omp_atk_pinned set to true doesn't mean it will be
in any way related to NVPTX offloading (unless it is in NVPTX target region
obviously, but then mlock isn't available, so sure, if there is something
CUDA can provide for that case, nice).

> I don't think libmemkind will resolve this performance issue, although
> certainly it can be used for host implementations of low-latency memories,
> etc.

The reason for libmemkind is primarily its support of HBW memory (but
admittedly I need to find out what kind of such memory it does support),
or the various interleaving etc. the library has.
Plus, when we have such support, as it has its own customizable allocator,
it could be used to allocate larger chunks of memory that can be mlocked
and then just allocate from that pinned memory if the user asks for small
allocations from that memory.
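
(A minimal sketch of that sub-allocation idea - a trivial bump allocator
over one mlocked chunk; the names are invented and freeing, reuse, thread
safety and growing the pool are all omitted:)

#include <sys/mman.h>
#include <stddef.h>

/* Hypothetical: pin one larger chunk up front and hand out small pinned
   allocations from it, so each small request doesn't burn extra pages of
   the RLIMIT_MEMLOCK budget.  */
static char *pinned_pool;
static size_t pinned_pool_size, pinned_pool_used;

static void *
small_pinned_alloc (size_t size)
{
  if (!pinned_pool)
    {
      pinned_pool_size = 1 << 20;	/* 1MB; arbitrary for the sketch.  */
      void *p = mmap (NULL, pinned_pool_size, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (p == MAP_FAILED || mlock (p, pinned_pool_size))
	return NULL;	/* (Cleanup of the mapping on mlock failure omitted.)  */
      pinned_pool = p;
    }
  size = (size + 15) & ~(size_t) 15;	/* Keep 16-byte alignment.  */
  if (pinned_pool_used + size > pinned_pool_size)
    return NULL;
  void *ret = pinned_pool + pinned_pool_used;
  pinned_pool_used += size;
  return ret;
}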

	Jakub



* Re: [PATCH] libgomp, openmp: pinned memory
  2022-06-07 12:10               ` Jakub Jelinek
@ 2022-06-07 12:28                 ` Andrew Stubbs
  2022-06-07 12:40                   ` Jakub Jelinek
  2022-06-09  9:38                   ` Thomas Schwinge
  0 siblings, 2 replies; 28+ messages in thread
From: Andrew Stubbs @ 2022-06-07 12:28 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On 07/06/2022 13:10, Jakub Jelinek wrote:
> On Tue, Jun 07, 2022 at 12:05:40PM +0100, Andrew Stubbs wrote:
>> Following some feedback from users of the OG11 branch I think I need to
>> withdraw this patch, for now.
>>
>> The memory pinned via the mlock call does not give the expected performance
>> boost. I had not expected that it would do much in my test setup, given that
>> the machine has a lot of RAM and my benchmarks are small, but others have
>> tried more and on varying machines and architectures.
> 
> I don't understand why there should be any expected performance boost (at
> least not unless the machine starts swapping out pages),
> { omp_atk_pinned, true } is solely about the requirement that the memory
> can't be swapped out.

It seems like it takes a faster path through the NVidia drivers. This is 
a black box for me, but that seems like a plausible explanation. The 
results are different on x86_64 and powerpc hosts (such as the Summit 
supercomputer).

>> It seems that it isn't enough for the memory to be pinned, it has to be
>> pinned using the Cuda API to get the performance boost. I had not done this
> 
> For performance boost of what kind of code?
> I don't understand how Cuda API could be useful (or can be used at all) if
> offloading to NVPTX isn't involved.  The fact that somebody asks for host
> memory allocation with omp_atk_pinned set to true doesn't mean it will be
> in any way related to NVPTX offloading (unless it is in NVPTX target region
> obviously, but then mlock isn't available, so sure, if there is something
> CUDA can provide for that case, nice).

This is specifically for NVPTX offload, of course, but then that's what 
our customer is paying for.

The expectation, from users, is that memory pinning will give the 
benefits specific to the active device. We can certainly make that 
happen when there is only one (flavour of) offload device present. I had 
hoped it could be one way for all, but it looks like not.

> 
>> I don't think libmemkind will resolve this performance issue, although
>> certainly it can be used for host implementations of low-latency memories,
>> etc.
> 
> The reason for libmemkind is primarily its support of HBW memory (but
> admittedly I need to find out what kind of such memory it does support),
> or the various interleaving etc. the library has.
> Plus, when we have such support, as it has its own customizable allocator,
> it could be used to allocate larger chunks of memory that can be mlocked
> and then just allocate from that pinned memory if the user asks for small
> allocations from that memory.

It should be straightforward to switch the no-offload implementation to 
libmemkind when the time comes (the changes would be contained within 
config/linux/allocator.c), but I have no plans to do so myself (and no 
hardware to test it with). I'd prefer that it didn't impede the offload 
solution in the meantime.

Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] libgomp, openmp: pinned memory
  2022-06-07 12:28                 ` Andrew Stubbs
@ 2022-06-07 12:40                   ` Jakub Jelinek
  2022-06-09  9:38                   ` Thomas Schwinge
  1 sibling, 0 replies; 28+ messages in thread
From: Jakub Jelinek @ 2022-06-07 12:40 UTC (permalink / raw)
  To: Andrew Stubbs; +Cc: gcc-patches

On Tue, Jun 07, 2022 at 01:28:33PM +0100, Andrew Stubbs wrote:
> > For performance boost of what kind of code?
> > I don't understand how Cuda API could be useful (or can be used at all) if
> > offloading to NVPTX isn't involved.  The fact that somebody asks for host
> > memory allocation with omp_atk_pinned set to true doesn't mean it will be
> > in any way related to NVPTX offloading (unless it is in NVPTX target region
> > obviously, but then mlock isn't available, so sure, if there is something
> > CUDA can provide for that case, nice).
> 
> This is specifically for NVPTX offload, of course, but then that's what our
> customer is paying for.
> 
> The expectation, from users, is that memory pinning will give the benefits
> specific to the active device. We can certainly make that happen when there
> is only one (flavour of) offload device present. I had hoped it could be one
> way for all, but it looks like not.

I think that is just an expectation that isn't backed by anything in the
standard.
When users need something like that, it would first be good to describe
what it actually is: memory that will be primarily used for interfacing
with offloading device 0 (or some specific device given by some number),
memory that can be used without remapping on some offloading device, or
something else?  And once we know what exactly that is (e.g. what Cuda
APIs or GCN APIs etc. can provide), we should discuss on omp-lang whether
there shouldn't be some standard way to ask for such an allocator.  Or
there is always the possibility of extensions.  Not sure if one can just
define ompx_atv_whatever, use some large value for it (but the spec
doesn't have a vendor range which would be safe to use) and support it
that way.
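
Purely to illustrate what such a (hypothetical, non-existing) extension
might look like at the user level; the trait value below is invented and
current libgomp would simply return omp_null_allocator for it:

/* Hypothetical sketch only: ompx_atv_nvptx_pinned does not exist anywhere;
   the value 0x10000 is arbitrary (no vendor range is reserved).  */
#include <omp.h>

#define ompx_atv_nvptx_pinned 0x10000

static omp_allocator_handle_t
make_device_pinned_allocator (void)
{
  omp_alloctrait_t traits[] = {
    { omp_atk_pinned, ompx_atv_nvptx_pinned }   /* instead of omp_atv_true */
  };
  return omp_init_allocator (omp_default_mem_space, 1, traits);
}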

Plus a different thing is allocators in the offloading regions.
I think we should translate some omp_alloc etc. calls in such regions
when they use constant expression standard allocators to doing the
allocation through other means, or allocators.c can be overridden or
amended for the needs or possibilities of the offloading targets.

	Jakub


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] libgomp, openmp: pinned memory
  2022-06-07 12:28                 ` Andrew Stubbs
  2022-06-07 12:40                   ` Jakub Jelinek
@ 2022-06-09  9:38                   ` Thomas Schwinge
  2022-06-09 10:09                     ` Tobias Burnus
                                       ` (2 more replies)
  1 sibling, 3 replies; 28+ messages in thread
From: Thomas Schwinge @ 2022-06-09  9:38 UTC (permalink / raw)
  To: Andrew Stubbs, Jakub Jelinek; +Cc: gcc-patches

Hi!

I'm not all too familiar with the "newish" CUDA Driver API, but maybe the
following is useful still:

On 2022-06-07T13:28:33+0100, Andrew Stubbs <ams@codesourcery.com> wrote:
> On 07/06/2022 13:10, Jakub Jelinek wrote:
>> On Tue, Jun 07, 2022 at 12:05:40PM +0100, Andrew Stubbs wrote:
>>> Following some feedback from users of the OG11 branch I think I need to
>>> withdraw this patch, for now.
>>>
>>> The memory pinned via the mlock call does not give the expected performance
>>> boost. I had not expected that it would do much in my test setup, given that
>>> the machine has a lot of RAM and my benchmarks are small, but others have
>>> tried more and on varying machines and architectures.
>>
>> I don't understand why there should be any expected performance boost (at
>> least not unless the machine starts swapping out pages),
>> { omp_atk_pinned, true } is solely about the requirement that the memory
>> can't be swapped out.
>
> It seems like it takes a faster path through the NVidia drivers. This is
> a black box, for me, but that seems like a plausible explanation. The
> results are different on x86_64 and powerpc hosts (such as the Summit
> supercomputer).

For example, it's documented that 'cuMemHostAlloc',
<https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g572ca4011bfcb25034888a14d4e035b9>,
"Allocates page-locked host memory".  The crucial thing, though, what
makes this different from 'malloc' plus 'mlock', is that "The driver
tracks the virtual memory ranges allocated with this function and
automatically accelerates calls to functions such as cuMemcpyHtoD().
Since the memory can be accessed directly by the device, it can be read
or written with much higher bandwidth than pageable memory obtained with
functions such as malloc()".

Similar, for example, for 'cuMemAllocHost',
<https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0>.

This, to me, would explain why "the mlock call does not give the expected
performance boost", in comparison with 'cuMemAllocHost'/'cuMemHostAlloc';
with 'mlock' you're missing the "tracks the virtual memory ranges"
aspect.
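
For illustration, a minimal sketch of that Driver API path (error
handling elided; a CUDA context is assumed to exist already):

/* Illustrative sketch only: allocate driver-tracked page-locked host
   memory and use it as a staging buffer for a host-to-device copy.  */
#include <cuda.h>
#include <string.h>

static void
copy_via_pinned_staging (CUdeviceptr d_dst, const void *src, size_t size)
{
  void *h_buf;
  /* Page-locked host memory; the driver tracks this range.  */
  cuMemHostAlloc (&h_buf, size, CU_MEMHOSTALLOC_PORTABLE);
  memcpy (h_buf, src, size);
  /* The driver recognizes h_buf as page-locked and takes the fast path.  */
  cuMemcpyHtoD (d_dst, h_buf, size);
  cuMemFreeHost (h_buf);
}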

Also, by means of the Nvidia Driver allocating the memory, I suppose
using this interface likely circumvents any "annoying" 'ulimit'
limitations?  I get this impression, because documentation continues
stating that "Allocating excessive amounts of memory with
cuMemAllocHost() may degrade system performance, since it reduces the
amount of memory available to the system for paging.  As a result, this
function is best used sparingly to allocate staging areas for data
exchange between host and device".

>>> It seems that it isn't enough for the memory to be pinned, it has to be
>>> pinned using the Cuda API to get the performance boost.
>>
>> For performance boost of what kind of code?
>> I don't understand how Cuda API could be useful (or can be used at all) if
>> offloading to NVPTX isn't involved.  The fact that somebody asks for host
>> memory allocation with omp_atk_pinned set to true doesn't mean it will be
>> in any way related to NVPTX offloading (unless it is in NVPTX target region
>> obviously, but then mlock isn't available, so sure, if there is something
>> CUDA can provide for that case, nice).
>
> This is specifically for NVPTX offload, of course, but then that's what
> our customer is paying for.
>
> The expectation, from users, is that memory pinning will give the
> benefits specific to the active device. We can certainly make that
> happen when there is only one (flavour of) offload device present. I had
> hoped it could be one way for all, but it looks like not.

Aren't there CUDA Driver interfaces for that?  That is:

>>> I had not done this
>>> because it was difficult to resolve the code abstraction
>>> difficulties and anyway the implementation was supposed to be device
>>> independent, but it seems we need a specific pinning mechanism for each
>>> device.

If not directly *allocating and registering* such memory via
'cuMemAllocHost'/'cuMemHostAlloc', you should still be able to only
*register* your standard 'malloc'ed etc. memory via 'cuMemHostRegister',
<https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223>:
"Page-locks the memory range specified [...] and maps it for the
device(s) [...].  This memory range also is added to the same tracking
mechanism as cuMemHostAlloc to automatically accelerate [...]"?  (No
manual 'mlock'ing involved in that case, either; presumably again using this
interface likely circumvents any "annoying" 'ulimit' limitations?)
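
For illustration, the *register* variant on ordinary 'malloc'ed memory
might look roughly like this (hypothetical helper names; error handling
and alignment/granularity details are glossed over):

/* Illustrative sketch only: page-lock existing host memory via
   cuMemHostRegister instead of allocating it through CUDA.  */
#include <cuda.h>
#include <stdlib.h>

static void *
alloc_and_register (size_t size)
{
  void *p = malloc (size);
  if (p
      && cuMemHostRegister (p, size, CU_MEMHOSTREGISTER_PORTABLE)
         != CUDA_SUCCESS)
    {
      free (p);
      return NULL;
    }
  return p;
}

static void
unregister_and_free (void *p)
{
  cuMemHostUnregister (p);
  free (p);
}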

Such a *register* abstraction can then be implemented by all the libgomp
offloading plugins: they just call the respective
CUDA/HSA/etc. functions to register such (existing, 'malloc'ed, etc.)
memory.

..., but maybe I'm missing some crucial "detail" here?


Grüße
 Thomas
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] libgomp, openmp: pinned memory
  2022-06-09  9:38                   ` Thomas Schwinge
@ 2022-06-09 10:09                     ` Tobias Burnus
  2022-06-09 10:22                       ` Stubbs, Andrew
  2022-06-09 10:31                     ` Stubbs, Andrew
  2023-02-16 15:32                     ` Attempt to register OpenMP pinned memory using a device instead of 'mlock' (was: [PATCH] libgomp, openmp: pinned memory) Thomas Schwinge
  2 siblings, 1 reply; 28+ messages in thread
From: Tobias Burnus @ 2022-06-09 10:09 UTC (permalink / raw)
  To: Thomas Schwinge, Andrew Stubbs, Jakub Jelinek; +Cc: gcc-patches

On 09.06.22 11:38, Thomas Schwinge wrote:
> On 2022-06-07T13:28:33+0100, Andrew Stubbs <ams@codesourcery.com> wrote:
>> On 07/06/2022 13:10, Jakub Jelinek wrote:
>>> On Tue, Jun 07, 2022 at 12:05:40PM +0100, Andrew Stubbs wrote:
>>>> The memory pinned via the mlock call does not give the expected performance
>>>> boost. I had not expected that it would do much in my test setup, given that
>>>> the machine has a lot of RAM and my benchmarks are small, but others have
>>>> tried more and on varying machines and architectures.
>>> I don't understand why there should be any expected performance boost (at
>>> least not unless the machine starts swapping out pages),
>>> { omp_atk_pinned, true } is solely about the requirement that the memory
>>> can't be swapped out.
>> It seems like it takes a faster path through the NVidia drivers. [...]

I think this conflates two parts:

* User-defined allocators in general – there CUDA does not make much
sense, and without unified-shared memory the allocation will always be
inaccessible on the device (w/o explicit/implicit mapping).

* Memory which is supposed to be accessible both on the host and on the
device. That's most obvious when explicitly allocating memory to be
accessible on both – it is less clear cut when just creating an allocator
with unified-shared memory, as it is not clear whether it is only used on
the host (e.g. with host-based thread parallelization) – or whether it is
also relevant for the device.

Currently, the user has no means to express the intent that it should be
accessible on both the host and one/several devices, except for 'omp
requires unified_shared_memory'.

The next OpenMP version will likely provide a means to create an
allocator which permits this →
https://github.com/OpenMP/spec/issues/1843 (not publicly available;
slides (last comment) are slightly outdated).

  * * *

The question is only what to do with 'requires unified_shared_memory' –
and a non-multi-device allocator.

Probably: unified_shared_memory or no nvptx device: just use mlock.
Otherwise (i.e. both nvptx device and (unified_shared_memory or a
multi-device-allocator)), use the CUDA one.

For the latter, I think Thomas' remarks are helpful.

Tobias

-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH] libgomp, openmp: pinned memory
  2022-06-09 10:09                     ` Tobias Burnus
@ 2022-06-09 10:22                       ` Stubbs, Andrew
  0 siblings, 0 replies; 28+ messages in thread
From: Stubbs, Andrew @ 2022-06-09 10:22 UTC (permalink / raw)
  To: Burnus, Tobias, Schwinge, Thomas, Jakub Jelinek; +Cc: gcc-patches

> The question is only what to do with 'requires unified_shared_memory' –
> and a non-multi-device allocator.

The compiler emits an error at compile time if you attempt to use both
-foffload-memory=pinned and USM, because they're not compatible. You're
fine to use both explicit allocators in the same program, but the
"pinnedness" of USM allocations is a matter for Cuda to care about
(cuMemAllocManaged) and has nothing to do with this discussion.

The OpenMP pinned memory feature is intended to accelerate normal mappings, as far as I can tell.

Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH] libgomp, openmp: pinned memory
  2022-06-09  9:38                   ` Thomas Schwinge
  2022-06-09 10:09                     ` Tobias Burnus
@ 2022-06-09 10:31                     ` Stubbs, Andrew
  2023-02-16 15:32                     ` Attempt to register OpenMP pinned memory using a device instead of 'mlock' (was: [PATCH] libgomp, openmp: pinned memory) Thomas Schwinge
  2 siblings, 0 replies; 28+ messages in thread
From: Stubbs, Andrew @ 2022-06-09 10:31 UTC (permalink / raw)
  To: Schwinge, Thomas, Jakub Jelinek; +Cc: gcc-patches

> For example, it's documented that 'cuMemHostAlloc',
> <https://docs.nvidia.com/cuda/cuda-driver-
> api/group__CUDA__MEM.html#group__CUDA__MEM_1g572ca4011bfcb25034888a14d4e035b
> 9>,
> "Allocates page-locked host memory".  The crucial thing, though, what
> makes this different from 'malloc' plus 'mlock' is, that "The driver
> tracks the virtual memory ranges allocated with this function and
> automatically accelerates calls to functions such as cuMemcpyHtoD().
> Since the memory can be accessed directly by the device, it can be read
> or written with much higher bandwidth than pageable memory obtained with
> functions such as malloc()".

OK, interesting. I had not seen this, but I think it confirms that the
performance difference comes from within Cuda, and that regular locked
memory on its own is not so great.

> Also, by means of the Nvidia Driver allocating the memory, I suppose
> using this interface likely circumvents any "annoying" 'ulimit'
> limitations?

Yes, this is the case.

> If not directly *allocating and registering* such memory via
> 'cuMemAllocHost'/'cuMemHostAlloc', you should still be able to only
> *register* your standard 'malloc'ed etc. memory via 'cuMemHostRegister',
> <https://docs.nvidia.com/cuda/cuda-driver-
> api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b5422
> 3>:
> "Page-locks the memory range specified [...] and maps it for the
> device(s) [...].  This memory range also is added to the same tracking
> mechanism as cuMemHostAlloc to automatically accelerate [...]"?  (No
> manual 'mlock'ing involved in that case, too; presumably again using this
> interface likely circumvents any "annoying" 'ulimit' limitations?)
> 
> Such a *register* abstraction can then be implemented by all the libgomp
> offloading plugins: they just call the respective
> CUDA/HSA/etc. functions to register such (existing, 'malloc'ed, etc.)
> memory.
> 
> ..., but maybe I'm missing some crucial "detail" here?

I'm investigating this stuff for the AMD USM implementation as well right
now. It might be a good way to handle static and stack data too. Or not.

Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] libgomp, openmp: pinned memory
  2022-01-13 13:53           ` Andrew Stubbs
  2022-06-07 11:05             ` Andrew Stubbs
@ 2023-02-10 15:11             ` Thomas Schwinge
  2023-02-10 15:55               ` Andrew Stubbs
  2023-02-16 21:39             ` [og12] Clarify/verify OpenMP 'omp_calloc' zero-initialization for pinned memory (was: [PATCH] libgomp, openmp: pinned memory) Thomas Schwinge
  2 siblings, 1 reply; 28+ messages in thread
From: Thomas Schwinge @ 2023-02-10 15:11 UTC (permalink / raw)
  To: Andrew Stubbs, Jakub Jelinek, Tobias Burnus; +Cc: gcc-patches

Hi!

Re OpenMP 'pinned' memory allocator trait semantics vs. 'omp_realloc':

On 2022-01-13T13:53:03+0000, Andrew Stubbs <ams@codesourcery.com> wrote:
> On 05/01/2022 17:07, Andrew Stubbs wrote:
>> [...], I'm working on an implementation using mmap instead of malloc
>> for pinned allocations.  [...]

> This means that large allocations will now be page aligned and therefore
> pin the smallest number of pages for the size requested, and that that
> memory will be unpinned automatically when freed via munmap, or moved
> via mremap.

> --- /dev/null
> +++ b/libgomp/config/linux/allocator.c

> +static void *
> +linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
> +                     size_t oldsize, size_t size, int oldpin, int pin)
> +{
> +  if (oldpin && pin)
> +    {
> +      void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
> +      if (newaddr == MAP_FAILED)
> +     return NULL;
> +
> +      return newaddr;
> +    }
> +  else if (oldpin || pin)
> +    {
> +      void *newaddr = linux_memspace_alloc (memspace, size, pin);
> +      if (newaddr)
> +     {
> +       memcpy (newaddr, addr, oldsize < size ? oldsize : size);
> +       linux_memspace_free (memspace, addr, oldsize, oldpin);
> +     }
> +
> +      return newaddr;
> +    }
> +  else
> +    return realloc (addr, size);
> +}

I did wonder if 'mremap' with 'MREMAP_MAYMOVE' is really acceptable here,
given OpenMP 5.2, 6.2 "Memory Allocators": "Allocators with the 'pinned'
trait defined to be 'true' ensure that their allocations remain in the
same storage resource at the same location for their entire lifetime."
I'd have read into this that 'realloc' may shrink or enlarge the region
(unless even that considered faulty), but the region must not be moved
("same location"), thus no 'MREMAP_MAYMOVE'; see 'man 2 mremap'
(2019-03-06):

    'MREMAP_MAYMOVE'
        By  default, if there is not sufficient space to expand a mapping at its current location, then 'mremap()' fails.  If this flag is specified, then the kernel is permitted to relocate the mapping to a new virtual address, if necessary.  If the mapping is relocated, then absolute pointers into the old mapping location become invalid (offsets relative to the starting address of the mapping should be employed).

..., but then I saw that OpenMP 5.2, 18.13.9 'omp_realloc' is specified
such that it isn't expected to 'realloc' in-place, but rather it
"deallocates previously allocated memory and requests a memory
allocation", which I understand that it does end a "lifetime" and then
establish a new "lifetime", which means that 'MREMAP_MAYMOVE' in fact is
fine (as implemented)?
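
For illustration, the stricter reading would amount to dropping the flag
and only ever resizing in place, roughly:

/* Illustrative sketch only: grow/shrink the pinned mapping strictly
   in place; fail instead of moving it.  */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stddef.h>

static void *
pinned_resize_in_place (void *addr, size_t oldsize, size_t size)
{
  void *newaddr = mremap (addr, oldsize, size, 0);  /* no MREMAP_MAYMOVE */
  if (newaddr == MAP_FAILED)
    return NULL;        /* cannot resize in place -> the reallocation fails */
  return newaddr;       /* newaddr == addr here */
}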


Further I read in 'man 2 mremap' (2019-03-06):

    If  the  memory segment specified by *old_address* and *old_size* is locked (using 'mlock(2)' or similar), then this lock is maintained when the segment is resized and/or relocated.  As a consequence, the amount of memory locked by the process may change.

(The current proposed code evidently does make use of that; OK.)

But then in 'NOTES' I read:

    If 'mremap()' is used to move or expand an area locked with 'mlock(2)' or equivalent, the 'mremap()' call will make a best effort to populate the new area but will not fail with 'ENOMEM' if the area cannot be populated.

What exactly is that supposed to tell us: "will make a best effort [...]
but will not fail"?  Isn't that in conflict with the earlier statement?
So can we rely on 'mremap' together with 'mlock' or not?


(This topic remains valid even if we follow through the idea of using
CUDA to register page-locked memory, because that's not available in all
configurations, and we then still want to do the 'mmap'/'mlock' thing, I
suppose.)


Grüße
 Thomas
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] libgomp, openmp: pinned memory
  2023-02-10 15:11             ` [PATCH] libgomp, openmp: pinned memory Thomas Schwinge
@ 2023-02-10 15:55               ` Andrew Stubbs
  0 siblings, 0 replies; 28+ messages in thread
From: Andrew Stubbs @ 2023-02-10 15:55 UTC (permalink / raw)
  To: Thomas Schwinge, Jakub Jelinek, Tobias Burnus; +Cc: gcc-patches

On 10/02/2023 15:11, Thomas Schwinge wrote:
> Hi!
> 
> Re OpenMP 'pinned' memory allocator trait semantics vs. 'omp_realloc':
> 
> On 2022-01-13T13:53:03+0000, Andrew Stubbs <ams@codesourcery.com> wrote:
>> On 05/01/2022 17:07, Andrew Stubbs wrote:
>>> [...], I'm working on an implementation using mmap instead of malloc
>>> for pinned allocations.  [...]
> 
>> This means that large allocations will now be page aligned and therefore
>> pin the smallest number of pages for the size requested, and that that
>> memory will be unpinned automatically when freed via munmap, or moved
>> via mremap.
> 
>> --- /dev/null
>> +++ b/libgomp/config/linux/allocator.c
> 
>> +static void *
>> +linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
>> +                     size_t oldsize, size_t size, int oldpin, int pin)
>> +{
>> +  if (oldpin && pin)
>> +    {
>> +      void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
>> +      if (newaddr == MAP_FAILED)
>> +     return NULL;
>> +
>> +      return newaddr;
>> +    }
>> +  else if (oldpin || pin)
>> +    {
>> +      void *newaddr = linux_memspace_alloc (memspace, size, pin);
>> +      if (newaddr)
>> +     {
>> +       memcpy (newaddr, addr, oldsize < size ? oldsize : size);
>> +       linux_memspace_free (memspace, addr, oldsize, oldpin);
>> +     }
>> +
>> +      return newaddr;
>> +    }
>> +  else
>> +    return realloc (addr, size);
>> +}
> 
> I did wonder if 'mremap' with 'MREMAP_MAYMOVE' is really acceptable here,
> given OpenMP 5.2, 6.2 "Memory Allocators": "Allocators with the 'pinned'
> trait defined to be 'true' ensure that their allocations remain in the
> same storage resource at the same location for their entire lifetime."
> I'd have read into this that 'realloc' may shrink or enlarge the region
> (unless even that considered faulty), but the region must not be moved
> ("same location"), thus no 'MREMAP_MAYMOVE'; see 'man 2 mremap'

I don't think the OpenMP specification really means that any program 
using omp_realloc should abort randomly depending on the vagaries of 
chaos? What are we supposed to do? Hugely over-allocate in case realloc 
is ever called?

Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Attempt to register OpenMP pinned memory using a device instead of 'mlock' (was: [PATCH] libgomp, openmp: pinned memory)
  2022-06-09  9:38                   ` Thomas Schwinge
  2022-06-09 10:09                     ` Tobias Burnus
  2022-06-09 10:31                     ` Stubbs, Andrew
@ 2023-02-16 15:32                     ` Thomas Schwinge
  2023-02-16 16:17                       ` Stubbs, Andrew
  2 siblings, 1 reply; 28+ messages in thread
From: Thomas Schwinge @ 2023-02-16 15:32 UTC (permalink / raw)
  To: Andrew Stubbs, Jakub Jelinek, Tobias Burnus, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 5589 bytes --]

Hi!

On 2022-06-09T11:38:22+0200, I wrote:
> On 2022-06-07T13:28:33+0100, Andrew Stubbs <ams@codesourcery.com> wrote:
>> On 07/06/2022 13:10, Jakub Jelinek wrote:
>>> On Tue, Jun 07, 2022 at 12:05:40PM +0100, Andrew Stubbs wrote:
>>>> Following some feedback from users of the OG11 branch I think I need to
>>>> withdraw this patch, for now.
>>>>
>>>> The memory pinned via the mlock call does not give the expected performance
>>>> boost. I had not expected that it would do much in my test setup, given that
>>>> the machine has a lot of RAM and my benchmarks are small, but others have
>>>> tried more and on varying machines and architectures.
>>>
>>> I don't understand why there should be any expected performance boost (at
>>> least not unless the machine starts swapping out pages),
>>> { omp_atk_pinned, true } is solely about the requirement that the memory
>>> can't be swapped out.
>>
>> It seems like it takes a faster path through the NVidia drivers. This is
>> a black box, for me, but that seems like a plausible explanation. The
>> results are different on x86_64 and powerpc hosts (such as the Summit
>> supercomputer).
>
> For example, it's documented that 'cuMemHostAlloc',
> <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g572ca4011bfcb25034888a14d4e035b9>,
> "Allocates page-locked host memory".  The crucial thing, though, what
> makes this different from 'malloc' plus 'mlock', is that "The driver
> tracks the virtual memory ranges allocated with this function and
> automatically accelerates calls to functions such as cuMemcpyHtoD().
> Since the memory can be accessed directly by the device, it can be read
> or written with much higher bandwidth than pageable memory obtained with
> functions such as malloc()".
>
> Similar, for example, for 'cuMemAllocHost',
> <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0>.
>
> This, to me, would explain why "the mlock call does not give the expected
> performance boost", in comparison with 'cuMemAllocHost'/'cuMemHostAlloc';
> with 'mlock' you're missing the "tracks the virtual memory ranges"
> aspect.
>
> Also, by means of the Nvidia Driver allocating the memory, I suppose
> using this interface likely circumvents any "annoying" 'ulimit'
> limitations?  I get this impression, because documentation continues
> stating that "Allocating excessive amounts of memory with
> cuMemAllocHost() may degrade system performance, since it reduces the
> amount of memory available to the system for paging.  As a result, this
> function is best used sparingly to allocate staging areas for data
> exchange between host and device".
>
>>>> It seems that it isn't enough for the memory to be pinned, it has to be
>>>> pinned using the Cuda API to get the performance boost.
>>>
>>> For performance boost of what kind of code?
>>> I don't understand how Cuda API could be useful (or can be used at all) if
>>> offloading to NVPTX isn't involved.  The fact that somebody asks for host
>>> memory allocation with omp_atk_pinned set to true doesn't mean it will be
>>> in any way related to NVPTX offloading (unless it is in NVPTX target region
>>> obviously, but then mlock isn't available, so sure, if there is something
>>> CUDA can provide for that case, nice).
>>
>> This is specifically for NVPTX offload, of course, but then that's what
>> our customer is paying for.
>>
>> The expectation, from users, is that memory pinning will give the
>> benefits specific to the active device. We can certainly make that
>> happen when there is only one (flavour of) offload device present. I had
>> hoped it could be one way for all, but it looks like not.
>
> Aren't there CUDA Driver interfaces for that?  That is:
>
>>>> I had not done this
>>>> because it was difficult to resolve the code abstraction
>>>> difficulties and anyway the implementation was supposed to be device
>>>> independent, but it seems we need a specific pinning mechanism for each
>>>> device.
>
> If not directly *allocating and registering* such memory via
> 'cuMemAllocHost'/'cuMemHostAlloc', you should still be able to only
> *register* your standard 'malloc'ed etc. memory via 'cuMemHostRegister',
> <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223>:
> "Page-locks the memory range specified [...] and maps it for the
> device(s) [...].  This memory range also is added to the same tracking
> mechanism as cuMemHostAlloc to automatically accelerate [...]"?  (No
> manual 'mlock'ing involved in that case, either; presumably again using this
> interface likely circumvents any "annoying" 'ulimit' limitations?)
>
> Such a *register* abstraction can then be implemented by all the libgomp
> offloading plugins: they just call the respective
> CUDA/HSA/etc. functions to register such (existing, 'malloc'ed, etc.)
> memory.
>
> ..., but maybe I'm missing some crucial "detail" here?

Indeed this does appear to work; see attached
"[WIP] Attempt to register OpenMP pinned memory using a device instead of 'mlock'".
Any comments (aside from the TODOs that I'm still working on)?


Grüße
 Thomas


-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-WIP-Attempt-to-register-OpenMP-pinned-memory-using-a.patch --]
[-- Type: text/x-diff, Size: 29631 bytes --]

From 97707db8602430e57b9f1c9c34da6a54ad9e2da9 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Thu, 16 Feb 2023 15:57:37 +0100
Subject: [PATCH] [WIP] Attempt to register OpenMP pinned memory using a device
 instead of 'mlock'

Implemented for nvptx offloading via 'cuMemHostRegister'.

This re-works og12 commit ab7520b3b4cd9fdabfd63652badde478955bd3b5
"libgomp: pinned memory".
---
 include/cuda/cuda.h                          |   3 +
 libgomp/config/linux/allocator.c             |  74 +++++++++-
 libgomp/libgomp-plugin.h                     |   2 +
 libgomp/libgomp.h                            |   4 +
 libgomp/plugin/cuda-lib.def                  |   3 +
 libgomp/plugin/plugin-nvptx.c                |  48 +++++++
 libgomp/target.c                             | 137 +++++++++++++++++++
 libgomp/testsuite/libgomp.c/alloc-pinned-1.c |  25 ++++
 libgomp/testsuite/libgomp.c/alloc-pinned-2.c |  25 ++++
 libgomp/testsuite/libgomp.c/alloc-pinned-3.c |  43 +++++-
 libgomp/testsuite/libgomp.c/alloc-pinned-4.c |  43 +++++-
 libgomp/testsuite/libgomp.c/alloc-pinned-5.c |  25 ++++
 libgomp/testsuite/libgomp.c/alloc-pinned-6.c |  34 ++++-
 13 files changed, 447 insertions(+), 19 deletions(-)

diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h
index 062d394b95f..b0c7636d318 100644
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -183,6 +183,9 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t);
 CUresult cuMemAllocHost (void **, size_t);
 CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
 CUresult cuMemHostAlloc (void **, size_t, unsigned int);
+#define cuMemHostRegister cuMemHostRegister_v2
+CUresult cuMemHostRegister(void *, size_t, unsigned int);
+CUresult cuMemHostUnregister(void *);
 CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
 #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
 CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
index f278e5cdf14..81e64b268e9 100644
--- a/libgomp/config/linux/allocator.c
+++ b/libgomp/config/linux/allocator.c
@@ -24,6 +24,10 @@
 
 /* Implement malloc routines that can handle pinned memory on Linux.
 
+   Given that pinned memory is typically used to help host <-> device memory
+   transfers, we attempt to register such using a device (really: libgomp
+   plugin), but fall back to mlock if no suitable device is available.
+
    It's possible to use mlock on any heap memory, but using munlock is
    problematic if there are multiple pinned allocations on the same page.
    Tracking all that manually would be possible, but adds overhead. This may
@@ -37,6 +41,7 @@
 #define _GNU_SOURCE
 #include <sys/mman.h>
 #include <string.h>
+#include <assert.h>
 #include "libgomp.h"
 
 static bool always_pinned_mode = false;
@@ -53,9 +58,15 @@ GOMP_enable_pinned_mode ()
     always_pinned_mode = true;
 }
 
+static int using_device_for_register_page_locked
+  = /* uninitialized */ -1;
+
 static void *
 linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
 {
+  gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, (unsigned long long) size, pin);
+
   /* Explicit pinning may not be required.  */
   pin = pin && !always_pinned_mode;
 
@@ -71,11 +82,32 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
       if (addr == MAP_FAILED)
 	return NULL;
 
-      if (mlock (addr, size))
+      int using_device
+	= __atomic_load_n (&using_device_for_register_page_locked,
+			   MEMMODEL_RELAXED);
+      gomp_debug (0, "  using_device=%d\n",
+		  using_device);
+      if (using_device != 0)
+	{
+	  using_device = gomp_register_page_locked (addr, size);
+	  int using_device_old
+	    = __atomic_exchange_n (&using_device_for_register_page_locked,
+				   using_device, MEMMODEL_RELAXED);
+	  gomp_debug (0, "  using_device=%d, using_device_old=%d\n",
+		      using_device, using_device_old);
+	  assert (using_device_old == -1
+		  /* We shouldn't have concurrently changed our mind.  */
+		  || using_device_old == using_device);
+	}
+      if (using_device == 0)
 	{
-	  gomp_debug (0, "libgomp: failed to pin memory (ulimit too low?)\n");
-	  munmap (addr, size);
-	  return NULL;
+	  gomp_debug (0, "  mlock\n");
+	  if (mlock (addr, size))
+	    {
+	      gomp_debug (0, "libgomp: failed to pin memory (ulimit too low?)\n");
+	      munmap (addr, size);
+	      return NULL;
+	    }
 	}
 
       return addr;
@@ -87,6 +119,9 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
 static void *
 linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
 {
+  gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, (unsigned long long) size, pin);
+
   /* Explicit pinning may not be required.  */
   pin = pin && !always_pinned_mode;
 
@@ -107,13 +142,28 @@ static void
 linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
 		     int pin)
 {
+  gomp_debug (0, "%s: memspace=%llu, addr=%p, size=%llu, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, addr, (unsigned long long) size, pin);
+
   /* Explicit pinning may not be required.  */
   pin = pin && !always_pinned_mode;
 
   if (memspace == ompx_unified_shared_mem_space)
     gomp_usm_free (addr, GOMP_DEVICE_ICV);
   else if (pin)
-    munmap (addr, size);
+    {
+      int using_device
+	= __atomic_load_n (&using_device_for_register_page_locked,
+			   MEMMODEL_RELAXED);
+      gomp_debug (0, "  using_device=%d\n",
+		  using_device);
+      if (using_device == 1)
+	gomp_unregister_page_locked (addr, size);
+      else
+	/* 'munlock'ing is implicit with following 'munmap'.  */
+	;
+      munmap (addr, size);
+    }
   else
     free (addr);
 }
@@ -122,6 +172,9 @@ static void *
 linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
 			size_t oldsize, size_t size, int oldpin, int pin)
 {
+  gomp_debug (0, "%s: memspace=%llu, addr=%p, oldsize=%llu, size=%llu, oldpin=%d, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, addr, (unsigned long long) oldsize, (unsigned long long) size, oldpin, pin);
+
   /* Explicit pinning may not be required.  */
   pin = pin && !always_pinned_mode;
 
@@ -129,6 +182,17 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
     goto manual_realloc;
   else if (oldpin && pin)
     {
+      /* We can only expect to be able to just 'mremap' if not using a device
+	 for registering page-locked memory.  */
+      int using_device
+	= __atomic_load_n (&using_device_for_register_page_locked,
+		       MEMMODEL_RELAXED);
+      gomp_debug (0, "  using_device=%d\n",
+		  using_device);
+      if (using_device != 0)
+	goto manual_realloc;
+
+      gomp_debug (0, "  mremap\n");
       void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
       if (newaddr == MAP_FAILED)
 	return NULL;
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index bb79ef8d9d7..345fc62d4f5 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -144,6 +144,8 @@ extern bool GOMP_OFFLOAD_free (int, void *);
 extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
 extern bool GOMP_OFFLOAD_usm_free (int, void *);
 extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
+extern bool GOMP_OFFLOAD_register_page_locked (void *, size_t);
+extern bool GOMP_OFFLOAD_unregister_page_locked (void *, size_t);
 extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index f6fab788519..f8cf04746ac 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1136,6 +1136,8 @@ extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
 			     void *);
 extern void * gomp_usm_alloc (size_t size, int device_num);
 extern void gomp_usm_free (void *device_ptr, int device_num);
+extern bool gomp_register_page_locked (void *, size_t);
+extern void gomp_unregister_page_locked (void *, size_t);
 
 /* Splay tree definitions.  */
 typedef struct splay_tree_node_s *splay_tree_node;
@@ -1395,6 +1397,8 @@ struct gomp_device_descr
   __typeof (GOMP_OFFLOAD_usm_alloc) *usm_alloc_func;
   __typeof (GOMP_OFFLOAD_usm_free) *usm_free_func;
   __typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
+  __typeof (GOMP_OFFLOAD_register_page_locked) *register_page_locked_func;
+  __typeof (GOMP_OFFLOAD_unregister_page_locked) *unregister_page_locked_func;
   __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
   __typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
   __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 9b786c9f2f6..8dbaadf848e 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -31,6 +31,9 @@ CUDA_ONE_CALL (cuMemAlloc)
 CUDA_ONE_CALL (cuMemAllocHost)
 CUDA_ONE_CALL (cuMemAllocManaged)
 CUDA_ONE_CALL (cuMemHostAlloc)
+CUDA_ONE_CALL_MAYBE_NULL (cuMemHostRegister_v2)
+CUDA_ONE_CALL (cuMemHostRegister)
+CUDA_ONE_CALL (cuMemHostUnregister)
 CUDA_ONE_CALL (cuMemcpy)
 CUDA_ONE_CALL (cuMemcpyDtoDAsync)
 CUDA_ONE_CALL (cuMemcpyDtoH)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 2ebf17728fa..cbdf466dd05 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -77,11 +77,14 @@ extern CUresult cuGetErrorString (CUresult, const char **);
 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
 			const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
+#undef cuMemHostRegister
+CUresult cuMemHostRegister (void *, size_t, unsigned int);
 #else
 typedef size_t (*CUoccupancyB2DSize)(int);
 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
 			   const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuMemHostRegister_v2 (void *, size_t, unsigned int);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
 					  CUoccupancyB2DSize, size_t, int);
 #endif
@@ -361,6 +364,9 @@ nvptx_thread (void)
 static bool
 nvptx_init (void)
 {
+  GOMP_PLUGIN_debug (0, "%s\n",
+		     __FUNCTION__);
+
   int ndevs;
 
   if (instantiated_devices != 0)
@@ -614,6 +620,9 @@ nvptx_close_device (struct ptx_device *ptx_dev)
 static int
 nvptx_get_num_devices (void)
 {
+  GOMP_PLUGIN_debug (0, "%s\n",
+		     __FUNCTION__);
+
   int n;
 
   /* This function will be called before the plugin has been initialized in
@@ -1704,6 +1713,45 @@ GOMP_OFFLOAD_is_usm_ptr (void *ptr)
   return managed;
 }
 
+bool
+GOMP_OFFLOAD_register_page_locked (void *ptr, size_t size)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
+		     __FUNCTION__, ptr, (unsigned long long) size);
+
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
+  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
+
+  /* 'cuMemHostRegister' "page-locks the memory range specified".  */
+
+  unsigned int flags = /*TODO*/ 0;
+#if 0
+  //TODO
+#define CU_MEMHOSTREGISTER_PORTABLE 0x01
+  flags |= CU_MEMHOSTREGISTER_PORTABLE;
+#endif
+  //TODO Do we need some more elaborate error management instead of this 'return false' for '!CUDA_SUCCESS'?
+  if (CUDA_CALL_EXISTS (cuMemHostRegister_v2))
+    CUDA_CALL (cuMemHostRegister_v2, ptr, size, flags);
+  else
+    CUDA_CALL (cuMemHostRegister, ptr, size, flags);
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_unregister_page_locked (void *ptr, size_t size)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
+		     __FUNCTION__, ptr, (unsigned long long) size);
+
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
+  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g81fd4101862bbefdb42a62d60e515eea
+
+  //TODO Do we need some more elaborate error management instead of this 'return false' for '!CUDA_SUCCESS'?
+  CUDA_CALL (cuMemHostUnregister, ptr);
+  return true;
+}
+
 void
 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
 			   void **hostaddrs, void **devaddrs,
diff --git a/libgomp/target.c b/libgomp/target.c
index 1b911c9bdb9..e7285188d1e 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -4584,6 +4584,141 @@ gomp_usm_free (void *device_ptr, int device_num)
   gomp_mutex_unlock (&devicep->lock);
 }
 
+
+/* Device (really: libgomp plugin) for registering page-locked memory.  We
+   assume there is either none or exactly one such device for the lifetime of
+   the process.  */
+
+static struct gomp_device_descr *device_for_register_page_locked
+  = /* uninitialized */ (void *) -1;
+
+static struct gomp_device_descr *
+get_device_for_register_page_locked (void)
+{
+  gomp_debug (0, "%s\n",
+	      __FUNCTION__);
+
+  struct gomp_device_descr *device;
+#ifdef HAVE_SYNC_BUILTINS
+  device
+    = __atomic_load_n (&device_for_register_page_locked, MEMMODEL_RELAXED);
+  if (device == (void *) -1)
+    {
+      gomp_debug (0, "  init\n");
+
+      gomp_init_targets_once ();
+
+      device = NULL;
+      for (int i = 0; i < num_devices; ++i)
+	{
+	  gomp_debug (0, "  i=%d, target_id=%d\n",
+		      i, devices[i].target_id);
+
+	  /* We consider only the first device of potentially several of the
+	     same type as this functionality is not specific to an individual
+	     offloading device, but instead relates to the host-side
+	     implementation of the respective offloading implementation.  */
+	  if (devices[i].target_id != 0)
+	    continue;
+
+	  if (!devices[i].register_page_locked_func)
+	    continue;
+
+	  gomp_debug (0, "  found device: %p (%s)\n",
+		      &devices[i], devices[i].name);
+	  if (device)
+	    gomp_fatal ("Unclear how %s and %s libgomp plugins may"
+			" simultaneously provide functionality"
+			" to register page-locked memory",
+			device->name, devices[i].name);
+	  else
+	    device = &devices[i];
+	}
+
+      struct gomp_device_descr *device_old
+	= __atomic_exchange_n (&device_for_register_page_locked, device,
+			       MEMMODEL_RELAXED);
+      gomp_debug (0, "  old device_for_register_page_locked: %p\n",
+		  device_old);
+      assert (device_old == (void *) -1
+	      /* We shouldn't have concurrently found a different or no
+		 device.  */
+	      || device_old == device);
+    }
+#else /* !HAVE_SYNC_BUILTINS */
+  gomp_debug (0, "  not implemented for '!HAVE_SYNC_BUILTINS'\n");
+  (void) &device_for_register_page_locked;
+  device = NULL;
+#endif /* HAVE_SYNC_BUILTINS */
+
+  gomp_debug (0, "  -> device=%p (%s)\n",
+	      device, device ? device->name : "[none]");
+  return device;
+}
+
+/* Register page-locked memory region.
+   Returns whether we have a device capable of that.  */
+
+attribute_hidden bool
+gomp_register_page_locked (void *ptr, size_t size)
+{
+  gomp_debug (0, "%s: ptr=%p, size=%llu\n",
+	      __FUNCTION__, ptr, (unsigned long long) size);
+
+  struct gomp_device_descr *device = get_device_for_register_page_locked ();
+  gomp_debug (0, "  device=%p (%s)\n",
+	      device, device ? device->name : "[none]");
+  if (device)
+    {
+      gomp_mutex_lock (&device->lock);
+      if (device->state == GOMP_DEVICE_UNINITIALIZED)
+	gomp_init_device (device);
+      else if (device->state == GOMP_DEVICE_FINALIZED)
+	{
+	  gomp_mutex_unlock (&device->lock);
+	  gomp_fatal ("Device %s for registering page-locked memory"
+		      " is finalized", device->name);
+	}
+      gomp_mutex_unlock (&device->lock);
+
+      if (!device->register_page_locked_func (ptr, size))
+	gomp_fatal ("Failed to register page-locked memory"
+		    " via %s libgomp plugin",
+		    device->name);
+    }
+  return device != NULL;
+}
+
+/* Unregister page-locked memory region.
+   This must only be called if 'gomp_register_page_locked' returned 'true'.  */
+
+attribute_hidden void
+gomp_unregister_page_locked (void *ptr, size_t size)
+{
+  gomp_debug (0, "%s: ptr=%p\n",
+	      __FUNCTION__, ptr);
+
+  struct gomp_device_descr *device = get_device_for_register_page_locked ();
+  gomp_debug (0, "  device=%p (%s)\n",
+	      device, device ? device->name : "[none]");
+  assert (device);
+
+  gomp_mutex_lock (&device->lock);
+  assert (device->state != GOMP_DEVICE_UNINITIALIZED);
+  if (device->state == GOMP_DEVICE_FINALIZED)
+    {
+      gomp_mutex_unlock (&device->lock);
+      return;
+    }
+  gomp_mutex_unlock (&device->lock);
+
+  if (!device->unregister_page_locked_func (ptr, size))
+    gomp_fatal ("Failed to unregister page-locked memory"
+		" via %s libgomp plugin",
+		device->name);
+}
+
+
 int
 omp_target_is_present (const void *ptr, int device_num)
 {
@@ -5268,6 +5403,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
   DLSYM_OPT (usm_alloc, usm_alloc);
   DLSYM_OPT (usm_free, usm_free);
   DLSYM_OPT (is_usm_ptr, is_usm_ptr);
+  DLSYM_OPT (register_page_locked, register_page_locked);
+  DLSYM_OPT (unregister_page_locked, unregister_page_locked);
   DLSYM (dev2host);
   DLSYM (host2dev);
   DLSYM (evaluate_device);
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-1.c b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
index fb7ac8b0080..bd71e22b003 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
@@ -2,6 +2,8 @@
 
 /* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory works.  */
 
 #include <stdio.h>
@@ -67,9 +69,14 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE;
   CHECK_SIZE (SIZE*3);
+#endif
 
   const omp_alloctrait_t traits[] = {
       { omp_atk_pinned, 1 }
@@ -85,19 +92,37 @@ main ()
     abort ();
 
   int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
   if (amount == 0)
     abort ();
+#endif
 
   p = omp_realloc (p, SIZE*2, allocator, allocator);
 
   int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
   if (amount2 <= amount)
     abort ();
+#endif
 
   p = omp_calloc (1, SIZE, allocator);
 
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
   if (get_pinned_mem () <= amount2)
     abort ();
+#endif
 
   verify0 (p, SIZE);
 
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-2.c b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
index 651b89fb42f..c71248b046d 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
@@ -2,6 +2,8 @@
 
 /* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory works (pool_size code path).  */
 
 #include <stdio.h>
@@ -67,9 +69,14 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE;
   CHECK_SIZE (SIZE*3);
+#endif
 
   const omp_alloctrait_t traits[] = {
       { omp_atk_pinned, 1 },
@@ -87,23 +94,41 @@ main ()
     abort ();
 
   int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
   if (amount == 0)
     abort ();
+#endif
 
   p = omp_realloc (p, SIZE*2, allocator, allocator);
   if (!p)
     abort ();
 
   int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
   if (amount2 <= amount)
     abort ();
+#endif
 
   p = omp_calloc (1, SIZE, allocator);
   if (!p)
     abort ();
 
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
   if (get_pinned_mem () <= amount2)
     abort ();
+#endif
 
   verify0 (p, SIZE);
 
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-3.c b/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
index f41797881ef..26b0c352d85 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory fails correctly.  */
 
 #include <stdio.h>
@@ -74,8 +76,14 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* This needs to be large enough to cover multiple pages.  */
   const int SIZE = PAGE_SIZE*4;
+#endif
+  const int PIN_LIMIT = PAGE_SIZE*2;
 
   /* Pinned memory, no fallback.  */
   const omp_alloctrait_t traits1[] = {
@@ -92,21 +100,33 @@ main ()
   omp_allocator_handle_t allocator2 = omp_init_allocator (omp_default_mem_space, 2, traits2);
 
   /* Ensure that the limit is smaller than the allocation.  */
-  set_pin_limit (SIZE/2);
+  set_pin_limit (PIN_LIMIT);
 
   // Sanity check
   if (get_pinned_mem () != 0)
     abort ();
 
-  // Should fail
   void *p = omp_alloc (SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail
   p = omp_calloc (1, SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
   // Should fall back
   p = omp_alloc (SIZE, allocator2);
@@ -119,16 +139,29 @@ main ()
     abort ();
   verify0 (p, SIZE);
 
-  // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
   p = omp_realloc (notpinned, SIZE, allocator1, omp_default_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'; does reallocate.
+  if (!notpinned || !p || p == notpinned)
+    abort ();
+#else
+  // Should fail to realloc
   if (!notpinned || p)
     abort ();
+#endif
 
-  // Should fall back to no realloc needed
+#ifdef OFFLOAD_DEVICE_NVPTX
+  void *p_ = omp_realloc (p, SIZE, allocator2, allocator1);
+  // Does reallocate.
+  if (p_ == p)
+    abort ();
+#else
   p = omp_realloc (notpinned, SIZE, allocator2, omp_default_mem_alloc);
+  // Should fall back to no realloc needed
   if (p != notpinned)
     abort ();
+#endif
 
   // No memory should have been pinned
   int amount = get_pinned_mem ();
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-4.c b/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
index a878da8c558..0bd6a552d94 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory fails correctly, pool_size code path.  */
 
 #include <stdio.h>
@@ -74,8 +76,14 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* This needs to be large enough to cover multiple pages.  */
   const int SIZE = PAGE_SIZE*4;
+#endif
+  const int PIN_LIMIT = PAGE_SIZE*2;
 
   /* Pinned memory, no fallback.  */
   const omp_alloctrait_t traits1[] = {
@@ -94,21 +102,33 @@ main ()
   omp_allocator_handle_t allocator2 = omp_init_allocator (omp_default_mem_space, 3, traits2);
 
   /* Ensure that the limit is smaller than the allocation.  */
-  set_pin_limit (SIZE/2);
+  set_pin_limit (PIN_LIMIT);
 
   // Sanity check
   if (get_pinned_mem () != 0)
     abort ();
 
-  // Should fail
   void *p = omp_alloc (SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail
   p = omp_calloc (1, SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
   // Should fall back
   p = omp_alloc (SIZE, allocator2);
@@ -121,16 +141,29 @@ main ()
     abort ();
   verify0 (p, SIZE);
 
-  // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
   p = omp_realloc (notpinned, SIZE, allocator1, omp_default_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'; does reallocate.
+  if (!notpinned || !p || p == notpinned)
+    abort ();
+#else
+  // Should fail to realloc
   if (!notpinned || p)
     abort ();
+#endif
 
-  // Should fall back to no realloc needed
+#ifdef OFFLOAD_DEVICE_NVPTX
+  void *p_ = omp_realloc (p, SIZE, allocator2, allocator1);
+  // Does reallocate.
+  if (p_ == p)
+    abort ();
+#else
   p = omp_realloc (notpinned, SIZE, allocator2, omp_default_mem_alloc);
+  // Should fall back to no realloc needed
   if (p != notpinned)
     abort ();
+#endif
 
   // No memory should have been pinned
   int amount = get_pinned_mem ();
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-5.c b/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
index 65983b3d03d..623c96a78e3 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
@@ -2,6 +2,8 @@
 
 /* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that ompx_pinned_mem_alloc works.  */
 
 #include <stdio.h>
@@ -67,9 +69,14 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE;
   CHECK_SIZE (SIZE*3);
+#endif
 
   // Sanity check
   if (get_pinned_mem () != 0)
@@ -80,19 +87,37 @@ main ()
     abort ();
 
   int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
   if (amount == 0)
     abort ();
+#endif
 
   p = omp_realloc (p, SIZE*2, ompx_pinned_mem_alloc, ompx_pinned_mem_alloc);
 
   int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
   if (amount2 <= amount)
     abort ();
+#endif
 
   p = omp_calloc (1, SIZE, ompx_pinned_mem_alloc);
 
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
   if (get_pinned_mem () <= amount2)
     abort ();
+#endif
 
   verify0 (p, SIZE);
 
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-6.c b/libgomp/testsuite/libgomp.c/alloc-pinned-6.c
index bbe20c04875..c0f8b260e37 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-6.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-6.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that ompx_pinned_mem_alloc fails correctly.  */
 
 #include <stdio.h>
@@ -66,31 +68,55 @@ set_pin_limit ()
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE*4;
+#endif
+  const int PIN_LIMIT = PAGE_SIZE*2;
 
   /* Ensure that the limit is smaller than the allocation.  */
-  set_pin_limit (SIZE/2);
+  set_pin_limit (PIN_LIMIT);
 
   // Sanity check
   if (get_pinned_mem () != 0)
     abort ();
 
-  // Should fail
   void *p = omp_alloc (SIZE, ompx_pinned_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail
   p = omp_calloc (1, SIZE, ompx_pinned_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
   p = omp_realloc (notpinned, SIZE, ompx_pinned_mem_alloc, omp_default_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'; does reallocate.
+  if (!notpinned || !p || p == notpinned)
+    abort ();
+#else
+  // Should fail to realloc
   if (!notpinned || p)
     abort ();
+#endif
 
   // No memory should have been pinned
   int amount = get_pinned_mem ();
-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: Attempt to register OpenMP pinned memory using a device instead of 'mlock' (was: [PATCH] libgomp, openmp: pinned memory)
  2023-02-16 15:32                     ` Attempt to register OpenMP pinned memory using a device instead of 'mlock' (was: [PATCH] libgomp, openmp: pinned memory) Thomas Schwinge
@ 2023-02-16 16:17                       ` Stubbs, Andrew
  2023-02-16 22:06                         ` [og12] " Thomas Schwinge
  0 siblings, 1 reply; 28+ messages in thread
From: Stubbs, Andrew @ 2023-02-16 16:17 UTC (permalink / raw)
  To: Thomas Schwinge, Andrew Stubbs, Jakub Jelinek, Tobias Burnus,
	gcc-patches

> -----Original Message-----
> From: Thomas Schwinge <thomas@codesourcery.com>
> Sent: 16 February 2023 15:33
> To: Andrew Stubbs <ams@codesourcery.com>; Jakub Jelinek <jakub@redhat.com>;
> Tobias Burnus <tobias@codesourcery.com>; gcc-patches@gcc.gnu.org
> Subject: Attempt to register OpenMP pinned memory using a device instead of
> 'mlock' (was: [PATCH] libgomp, openmp: pinned memory)
> 
> Hi!
> 
> On 2022-06-09T11:38:22+0200, I wrote:
> > On 2022-06-07T13:28:33+0100, Andrew Stubbs <ams@codesourcery.com> wrote:
> >> On 07/06/2022 13:10, Jakub Jelinek wrote:
> >>> On Tue, Jun 07, 2022 at 12:05:40PM +0100, Andrew Stubbs wrote:
> >>>> Following some feedback from users of the OG11 branch I think I need to
> >>>> withdraw this patch, for now.
> >>>>
> >>>> The memory pinned via the mlock call does not give the expected
> performance
> >>>> boost. I had not expected that it would do much in my test setup, given
> that
> >>>> the machine has a lot of RAM and my benchmarks are small, but others
> have
> >>>> tried more and on varying machines and architectures.
> >>>
> >>> I don't understand why there should be any expected performance boost
> (at
> >>> least not unless the machine starts swapping out pages),
> >>> { omp_atk_pinned, true } is solely about the requirement that the memory
> >>> can't be swapped out.
> >>
> >> It seems like it takes a faster path through the NVidia drivers. This is
> >> a black box, for me, but that seems like a plausible explanation. The
> >> results are different on x86_64 and powerpc hosts (such as the Summit
> >> supercomputer).
> >
> > For example, it's documented that 'cuMemHostAlloc',
> > <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g572ca4011bfcb25034888a14d4e035b9>,
> > "Allocates page-locked host memory".  The crucial thing, though, what
> > makes this different from 'malloc' plus 'mlock' is, that "The driver
> > tracks the virtual memory ranges allocated with this function and
> > automatically accelerates calls to functions such as cuMemcpyHtoD().
> > Since the memory can be accessed directly by the device, it can be read
> > or written with much higher bandwidth than pageable memory obtained with
> > functions such as malloc()".
> >
> > Similar, for example, for 'cuMemAllocHost',
> > <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0>.
> >
> > This, to me, would explain why "the mlock call does not give the expected
> > performance boost", in comparison with 'cuMemAllocHost'/'cuMemHostAlloc';
> > with 'mlock' you're missing the "tracks the virtual memory ranges"
> > aspect.
> >
> > Also, by means of the Nvidia Driver allocating the memory, I suppose
> > using this interface likely circumvents any "annoying" 'ulimit'
> > limitations?  I get this impression, because documentation continues
> > stating that "Allocating excessive amounts of memory with
> > cuMemAllocHost() may degrade system performance, since it reduces the
> > amount of memory available to the system for paging.  As a result, this
> > function is best used sparingly to allocate staging areas for data
> > exchange between host and device".
> >
> >>>> It seems that it isn't enough for the memory to be pinned, it has to be
> >>>> pinned using the Cuda API to get the performance boost.
> >>>
> >>> For performance boost of what kind of code?
> >>> I don't understand how Cuda API could be useful (or can be used at all)
> if
> >>> offloading to NVPTX isn't involved.  The fact that somebody asks for
> host
> >>> memory allocation with omp_atk_pinned set to true doesn't mean it will
> be
> >>> in any way related to NVPTX offloading (unless it is in NVPTX target
> region
> >>> obviously, but then mlock isn't available, so sure, if there is
> something
> >>> CUDA can provide for that case, nice).
> >>
> >> This is specifically for NVPTX offload, of course, but then that's what
> >> our customer is paying for.
> >>
> >> The expectation, from users, is that memory pinning will give the
> >> benefits specific to the active device. We can certainly make that
> >> happen when there is only one (flavour of) offload device present. I had
> >> hoped it could be one way for all, but it looks like not.
> >
> > Aren't there CUDA Driver interfaces for that?  That is:
> >
> >>>> I had not done this
> >>>> because it was difficult to resolve the code abstraction
> >>>> difficulties and anyway the implementation was supposed to be device
> >>>> independent, but it seems we need a specific pinning mechanism for each
> >>>> device.
> >
> > If not directly *allocating and registering* such memory via
> > 'cuMemAllocHost'/'cuMemHostAlloc', you should still be able to only
> > *register* your standard 'malloc'ed etc. memory via 'cuMemHostRegister',
> > <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223>:
> > "Page-locks the memory range specified [...] and maps it for the
> > device(s) [...].  This memory range also is added to the same tracking
> > mechanism as cuMemHostAlloc to automatically accelerate [...]"?  (No
> > manual 'mlock'ing involved in that case, too; presumably again using this
> > interface likely circumvents any "annoying" 'ulimit' limitations?)
> >
> > Such a *register* abstraction can then be implemented by all the libgomp
> > offloading plugins: they just call the respective
> > CUDA/HSA/etc. functions to register such (existing, 'malloc'ed, etc.)
> > memory.
> >
> > ..., but maybe I'm missing some crucial "detail" here?
> 
> Indeed this does appear to work; see attached
> "[WIP] Attempt to register OpenMP pinned memory using a device instead of
> 'mlock'".
> Any comments (aside from the TODOs that I'm still working on)?

The mmap implementation was not optimized for a lot of small allocations, and I can't see that issue changing here, so I don't know if this can be used for mlockall replacement.

I had assumed that using the Cuda allocator would fix that limitation.
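
A purely illustrative sketch of the "local allocator" idea this concern points
at (hypothetical names; not what libgomp or the og12 branch implements): pin
one large slab once, and hand out small allocations from it with a trivial
bump pointer, so the per-allocation mmap/pinning overhead is paid only once.
Real code would of course need locking, a free/reuse path, and growth or
fallback handling.

  #define _GNU_SOURCE
  #include <sys/mman.h>
  #include <stddef.h>

  /* Sketch only: one big slab, pinned once (mlock here; a device
     registration call would do equally well).  */
  static char *pool_base;
  static size_t pool_size, pool_used;

  static int
  pool_init (size_t size)
  {
    pool_base = mmap (NULL, size, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (pool_base == MAP_FAILED)
      return -1;
    if (mlock (pool_base, size))
      {
        munmap (pool_base, size);
        return -1;
      }
    pool_size = size;
    pool_used = 0;
    return 0;
  }

  /* Small requests are served from the already-pinned slab, so no
     further mmap/mlock calls are needed per allocation.  */
  static void *
  pool_alloc (size_t size)
  {
    size = (size + 15) & ~(size_t) 15;  /* Keep 16-byte alignment.  */
    if (pool_used + size > pool_size)
      return NULL;                      /* Real code: grow or fall back.  */
    void *p = pool_base + pool_used;
    pool_used += size;
    return p;
  }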

Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [og12] Clarify/verify OpenMP 'omp_calloc' zero-initialization for pinned memory (was: [PATCH] libgomp, openmp: pinned memory)
  2022-01-13 13:53           ` Andrew Stubbs
  2022-06-07 11:05             ` Andrew Stubbs
  2023-02-10 15:11             ` [PATCH] libgomp, openmp: pinned memory Thomas Schwinge
@ 2023-02-16 21:39             ` Thomas Schwinge
  2 siblings, 0 replies; 28+ messages in thread
From: Thomas Schwinge @ 2023-02-16 21:39 UTC (permalink / raw)
  To: Andrew Stubbs, gcc-patches; +Cc: Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 936 bytes --]

Hi!

On 2022-01-13T13:53:03+0000, Andrew Stubbs <ams@codesourcery.com> wrote:
> Pinned memory is allocated via mmap

> --- /dev/null
> +++ b/libgomp/config/linux/allocator.c

> +static void *
> +linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
> +{
> +  if (pin)
> +    return linux_memspace_alloc (memspace, size, pin);
> +[...]

This confused me for a moment: why don't we have to manually
zero-initialize here?  I've pushed to devel/omp/gcc-12 branch
commit 57b8f0600262566cd4f1ab12bf1bdafb29dbdc34
"Clarify/verify OpenMP 'omp_calloc' zero-initialization for pinned memory",
see attached.
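
The reason no manual zeroing is needed: anonymous private 'mmap' regions are
defined to be zero-filled, so the pinned 'omp_calloc' path can simply reuse
the pinned allocation path -- which is what the comments added by the attached
patch record.  A stand-alone check of that property (illustration only, not
part of the patch) might look like:

  #define _GNU_SOURCE
  #include <sys/mman.h>
  #include <assert.h>
  #include <stdlib.h>

  int
  main (void)
  {
    const size_t size = 1 << 20;
    /* Anonymous private mappings are zero-filled by definition; this is
       what the pinned omp_calloc path relies on.  */
    unsigned char *p = mmap (NULL, size, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
      abort ();
    for (size_t i = 0; i < size; i++)
      assert (p[i] == 0);
    munmap (p, size);
    return 0;
  }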


Grüße
 Thomas



[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Clarify-verify-OpenMP-omp_calloc-zero-initialization.patch --]
[-- Type: text/x-diff, Size: 5924 bytes --]

From 57b8f0600262566cd4f1ab12bf1bdafb29dbdc34 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Wed, 15 Feb 2023 10:23:03 +0100
Subject: [PATCH] Clarify/verify OpenMP 'omp_calloc' zero-initialization for
 pinned memory

Clarification for og12 commit ab7520b3b4cd9fdabfd63652badde478955bd3b5
"libgomp: pinned memory".  No functional change.

	libgomp/
	* config/linux/allocator.c (linux_memspace_alloc)
	(linux_memspace_calloc): Clarify zero-initialization for pinned
	memory.
	* testsuite/libgomp.c/alloc-pinned-1.c: Verify zero-initialization
	for pinned memory.
	* testsuite/libgomp.c/alloc-pinned-2.c: Likewise.
	* testsuite/libgomp.c/alloc-pinned-3.c: Likewise.
	* testsuite/libgomp.c/alloc-pinned-4.c: Likewise.
	* testsuite/libgomp.c/alloc-pinned-5.c: Likewise.
---
 libgomp/ChangeLog.omp                        | 10 ++++++++++
 libgomp/config/linux/allocator.c             |  2 ++
 libgomp/testsuite/libgomp.c/alloc-pinned-1.c | 10 ++++++++++
 libgomp/testsuite/libgomp.c/alloc-pinned-2.c | 10 ++++++++++
 libgomp/testsuite/libgomp.c/alloc-pinned-3.c |  9 +++++++++
 libgomp/testsuite/libgomp.c/alloc-pinned-4.c |  9 +++++++++
 libgomp/testsuite/libgomp.c/alloc-pinned-5.c | 10 ++++++++++
 7 files changed, 60 insertions(+)

diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index 1c4b1833c0b..530f5c6acf6 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,5 +1,15 @@
 2023-02-16  Thomas Schwinge  <thomas@codesourcery.com>
 
+	* config/linux/allocator.c (linux_memspace_alloc)
+	(linux_memspace_calloc): Clarify zero-initialization for pinned
+	memory.
+	* testsuite/libgomp.c/alloc-pinned-1.c: Verify zero-initialization
+	for pinned memory.
+	* testsuite/libgomp.c/alloc-pinned-2.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-3.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-4.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-5.c: Likewise.
+
 	* config/linux/allocator.c (linux_memspace_calloc): Elide
 	(innocuous) duplicate 'if' condition.
 	* config/nvptx/allocator.c (nvptx_memspace_free): Explicitly
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
index 8a9171c36df..f278e5cdf14 100644
--- a/libgomp/config/linux/allocator.c
+++ b/libgomp/config/linux/allocator.c
@@ -65,6 +65,7 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
     }
   else if (pin)
     {
+      /* 'mmap' zero-initializes, which 'linux_memspace_calloc' relies on.  */
       void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
 			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
       if (addr == MAP_FAILED)
@@ -96,6 +97,7 @@ linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
       return ret;
     }
   else if (pin)
+    /* If PINned, 'linux_memspace_alloc' 'mmap's, which zero-initializes.  */
     return linux_memspace_alloc (memspace, size, pin);
   else
     return calloc (1, size);
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-1.c b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
index 79792b16d83..fb7ac8b0080 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
@@ -54,6 +54,14 @@ get_pinned_mem ()
 }
 #endif
 
+static void
+verify0 (char *p, size_t s)
+{
+  for (size_t i = 0; i < s; ++i)
+    if (p[i] != 0)
+      abort ();
+}
+
 #include <omp.h>
 
 int
@@ -91,5 +99,7 @@ main ()
   if (get_pinned_mem () <= amount2)
     abort ();
 
+  verify0 (p, SIZE);
+
   return 0;
 }
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-2.c b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
index 228c656b715..651b89fb42f 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
@@ -54,6 +54,14 @@ get_pinned_mem ()
 }
 #endif
 
+static void
+verify0 (char *p, size_t s)
+{
+  for (size_t i = 0; i < s; ++i)
+    if (p[i] != 0)
+      abort ();
+}
+
 #include <omp.h>
 
 int
@@ -97,5 +105,7 @@ main ()
   if (get_pinned_mem () <= amount2)
     abort ();
 
+  verify0 (p, SIZE);
+
   return 0;
 }
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-3.c b/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
index 90539ffe3e0..f41797881ef 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
@@ -61,6 +61,14 @@ set_pin_limit ()
 }
 #endif
 
+static void
+verify0 (char *p, size_t s)
+{
+  for (size_t i = 0; i < s; ++i)
+    if (p[i] != 0)
+      abort ();
+}
+
 #include <omp.h>
 
 int
@@ -109,6 +117,7 @@ main ()
   p = omp_calloc (1, SIZE, allocator2);
   if (!p)
     abort ();
+  verify0 (p, SIZE);
 
   // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-4.c b/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
index 534e49eefc4..a878da8c558 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
@@ -61,6 +61,14 @@ set_pin_limit ()
 }
 #endif
 
+static void
+verify0 (char *p, size_t s)
+{
+  for (size_t i = 0; i < s; ++i)
+    if (p[i] != 0)
+      abort ();
+}
+
 #include <omp.h>
 
 int
@@ -111,6 +119,7 @@ main ()
   p = omp_calloc (1, SIZE, allocator2);
   if (!p)
     abort ();
+  verify0 (p, SIZE);
 
   // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-5.c b/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
index 315c7161a39..65983b3d03d 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
@@ -54,6 +54,14 @@ get_pinned_mem ()
 }
 #endif
 
+static void
+verify0 (char *p, size_t s)
+{
+  for (size_t i = 0; i < s; ++i)
+    if (p[i] != 0)
+      abort ();
+}
+
 #include <omp.h>
 
 int
@@ -86,5 +94,7 @@ main ()
   if (get_pinned_mem () <= amount2)
     abort ();
 
+  verify0 (p, SIZE);
+
   return 0;
 }
-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [og12] Attempt to register OpenMP pinned memory using a device instead of 'mlock' (was: [PATCH] libgomp, openmp: pinned memory)
  2023-02-16 16:17                       ` Stubbs, Andrew
@ 2023-02-16 22:06                         ` Thomas Schwinge
  2023-02-17  8:12                           ` Thomas Schwinge
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Schwinge @ 2023-02-16 22:06 UTC (permalink / raw)
  To: Andrew Stubbs, gcc-patches; +Cc: Jakub Jelinek, Tobias Burnus

[-- Attachment #1: Type: text/plain, Size: 2653 bytes --]

Hi!

On 2023-02-16T16:17:32+0000, "Stubbs, Andrew via Gcc-patches" <gcc-patches@gcc.gnu.org> wrote:
>> On 2022-06-09T11:38:22+0200, I wrote:
>> > [...]
>> > *register* your standard 'malloc'ed etc. memory via 'cuMemHostRegister',
>> > <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223>:
>> > "Page-locks the memory range specified [...] and maps it for the
>> > device(s) [...].  This memory range also is added to the same tracking
>> > mechanism as cuMemHostAlloc to automatically accelerate [...]"?  (No
>> > manual 'mlock'ing involved in that case, too; presumably again using this
>> > interface likely circumvents any "annoying" 'ulimit' limitations?)
>> >
>> > Such a *register* abstraction can then be implemented by all the libgomp
>> > offloading plugins: they just call the respective
>> > CUDA/HSA/etc. functions to register such (existing, 'malloc'ed, etc.)
>> > memory.
>> >
>> > ..., but maybe I'm missing some crucial "detail" here?
>>
>> Indeed this does appear to work; see attached
>> "[WIP] Attempt to register OpenMP pinned memory using a device instead of
>> 'mlock'".
>> Any comments (aside from the TODOs that I'm still working on)?

With those TODOs resolved, I've now pushed to devel/omp/gcc-12
commit a5a4800e92773da7126c00a9c79b172494d58ab5
"Attempt to register OpenMP pinned memory using a device instead of 'mlock'",
see attached.


> The mmap implementation was not optimized for a lot of small allocations, and I can't see that issue changing here

That's correct, 'mmap' remains.  Under the hood, 'cuMemHostRegister' must
surely also be doing some 'mlock'-like thing, so I figured it's best to
feed page-boundary memory regions to it, which 'mmap' gets us.

> so I don't know if this can be used for mlockall replacement.
>
> I had assumed that using the Cuda allocator would fix that limitation.

From what I've read (but no first-hand experiments), there's non-trivial
overhead with 'cuMemHostRegister' (just like with 'mlock'), so routing
all small allocations individually through it probably isn't a good idea
either.  Therefore, I suppose, we'll indeed want to use some local
allocator if we wish this "optimized for a lot of small allocations".

And, getting rid of 'mlockall' is yet another topic.
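
Condensed into a stand-alone sketch, the mmap-plus-register flow described
above looks roughly as follows.  This is illustration only -- the og12 code
routes the registration through the libgomp plugin rather than calling the
CUDA Driver API directly -- and it assumes a CUDA context is already current
on the calling thread.

  #define _GNU_SOURCE
  #include <stddef.h>
  #include <sys/mman.h>
  #include <cuda.h>

  /* Allocate page-aligned memory and page-lock ("pin") it by registering
     it with the CUDA driver instead of calling mlock.  */
  static void *
  pinned_alloc_via_register (size_t size)
  {
    void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED)
      return NULL;
    /* Page-locks the range and adds it to the driver's tracking, so later
       host <-> device copies can take the fast path.  */
    if (cuMemHostRegister (addr, size, 0) != CUDA_SUCCESS)
      {
        munmap (addr, size);
        return NULL;
      }
    return addr;
  }

  static void
  pinned_free_via_register (void *addr, size_t size)
  {
    cuMemHostUnregister (addr);  /* Non-trivial per-call cost, like mlock.  */
    munmap (addr, size);
  }

Both the registration and the unregistration are per-region driver calls,
which is why routing many small allocations through them individually is
unattractive.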


Grüße
 Thomas



[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Attempt-to-register-OpenMP-pinned-memory-using-a-dev.patch --]
[-- Type: text/x-diff, Size: 31728 bytes --]

From a5a4800e92773da7126c00a9c79b172494d58ab5 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Thu, 16 Feb 2023 15:57:37 +0100
Subject: [PATCH] Attempt to register OpenMP pinned memory using a device
 instead of 'mlock'

Implemented for nvptx offloading via 'cuMemHostRegister'.  This means: (a) not
running into 'mlock' limitations, and (b) the device is aware of this and may
optimize host <-> device memory transfers.

This re-works og12 commit ab7520b3b4cd9fdabfd63652badde478955bd3b5
"libgomp: pinned memory".

	include/
	* cuda/cuda.h (cuMemHostRegister, cuMemHostUnregister): New.
	libgomp/
	* config/linux/allocator.c (linux_memspace_alloc)
	(linux_memspace_free, linux_memspace_realloc): Attempt to register
	OpenMP pinned memory using a device instead of 'mlock'.
	* libgomp-plugin.h (GOMP_OFFLOAD_register_page_locked)
	(GOMP_OFFLOAD_unregister_page_locked): New.
	* libgomp.h (gomp_register_page_locked)
	(gomp_unregister_page_locked): New
	(struct gomp_device_descr): Add 'register_page_locked_func',
	'unregister_page_locked_func'.
	* plugin/cuda-lib.def (cuMemHostRegister_v2, cuMemHostRegister)
	(cuMemHostUnregister): New.
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_register_page_locked)
	(GOMP_OFFLOAD_unregister_page_locked): New.
	* target.c (gomp_register_page_locked)
	(gomp_unregister_page_locked): New.
	(gomp_load_plugin_for_device): Handle 'register_page_locked',
	'unregister_page_locked'.
	* testsuite/libgomp.c/alloc-pinned-1.c: Adjust.
	* testsuite/libgomp.c/alloc-pinned-2.c: Likewise.
	* testsuite/libgomp.c/alloc-pinned-3.c: Likewise.
	* testsuite/libgomp.c/alloc-pinned-4.c: Likewise.
	* testsuite/libgomp.c/alloc-pinned-5.c: Likewise.
	* testsuite/libgomp.c/alloc-pinned-6.c: Likewise.
---
 include/ChangeLog.omp                        |   4 +
 include/cuda/cuda.h                          |   3 +
 libgomp/ChangeLog.omp                        |  24 ++++
 libgomp/config/linux/allocator.c             |  74 +++++++++-
 libgomp/libgomp-plugin.h                     |   2 +
 libgomp/libgomp.h                            |   4 +
 libgomp/plugin/cuda-lib.def                  |   3 +
 libgomp/plugin/plugin-nvptx.c                |  33 +++++
 libgomp/target.c                             | 137 +++++++++++++++++++
 libgomp/testsuite/libgomp.c/alloc-pinned-1.c |  25 ++++
 libgomp/testsuite/libgomp.c/alloc-pinned-2.c |  25 ++++
 libgomp/testsuite/libgomp.c/alloc-pinned-3.c |  43 +++++-
 libgomp/testsuite/libgomp.c/alloc-pinned-4.c |  43 +++++-
 libgomp/testsuite/libgomp.c/alloc-pinned-5.c |  25 ++++
 libgomp/testsuite/libgomp.c/alloc-pinned-6.c |  34 ++++-
 15 files changed, 460 insertions(+), 19 deletions(-)

diff --git a/include/ChangeLog.omp b/include/ChangeLog.omp
index 190ae67a321..c8ef7a83c58 100644
--- a/include/ChangeLog.omp
+++ b/include/ChangeLog.omp
@@ -1,3 +1,7 @@
+2023-02-16  Thomas Schwinge  <thomas@codesourcery.com>
+
+	* cuda/cuda.h (cuMemHostRegister, cuMemHostUnregister): New.
+
 2023-02-09  Kwok Cheung Yeung  <kcy@codesourcery.com>
 
 	* gomp-constants.h (GOMP_MAP_FLAG_SPECIAL_5): New.
diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h
index 062d394b95f..b0c7636d318 100644
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -183,6 +183,9 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t);
 CUresult cuMemAllocHost (void **, size_t);
 CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
 CUresult cuMemHostAlloc (void **, size_t, unsigned int);
+#define cuMemHostRegister cuMemHostRegister_v2
+CUresult cuMemHostRegister(void *, size_t, unsigned int);
+CUresult cuMemHostUnregister(void *);
 CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
 #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
 CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index 819a5333907..7e464566a21 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,5 +1,29 @@
 2023-02-16  Thomas Schwinge  <thomas@codesourcery.com>
 
+	* config/linux/allocator.c (linux_memspace_alloc)
+	(linux_memspace_free, linux_memspace_realloc): Attempt to register
+	OpenMP pinned memory using a device instead of 'mlock'.
+	* libgomp-plugin.h (GOMP_OFFLOAD_register_page_locked)
+	(GOMP_OFFLOAD_unregister_page_locked): New.
+	* libgomp.h (gomp_register_page_locked)
+	(gomp_unregister_page_locked): New
+	(struct gomp_device_descr): Add 'register_page_locked_func',
+	'unregister_page_locked_func'.
+	* plugin/cuda-lib.def (cuMemHostRegister_v2, cuMemHostRegister)
+	(cuMemHostUnregister): New.
+	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_register_page_locked)
+	(GOMP_OFFLOAD_unregister_page_locked): New.
+	* target.c (gomp_register_page_locked)
+	(gomp_unregister_page_locked): New.
+	(gomp_load_plugin_for_device): Handle 'register_page_locked',
+	'unregister_page_locked'.
+	* testsuite/libgomp.c/alloc-pinned-1.c: Adjust.
+	* testsuite/libgomp.c/alloc-pinned-2.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-3.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-4.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-5.c: Likewise.
+	* testsuite/libgomp.c/alloc-pinned-6.c: Likewise.
+
 	* allocator.c (omp_realloc): Route 'free' through 'MEMSPACE_FREE'.
 
 	* config/linux/allocator.c (linux_memspace_alloc)
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
index f278e5cdf14..81e64b268e9 100644
--- a/libgomp/config/linux/allocator.c
+++ b/libgomp/config/linux/allocator.c
@@ -24,6 +24,10 @@
 
 /* Implement malloc routines that can handle pinned memory on Linux.
 
+   Given that pinned memory is typically used to help host <-> device memory
+   transfers, we attempt to register such using a device (really: libgomp
+   plugin), but fall back to mlock if no suitable device is available.
+
    It's possible to use mlock on any heap memory, but using munlock is
    problematic if there are multiple pinned allocations on the same page.
    Tracking all that manually would be possible, but adds overhead. This may
@@ -37,6 +41,7 @@
 #define _GNU_SOURCE
 #include <sys/mman.h>
 #include <string.h>
+#include <assert.h>
 #include "libgomp.h"
 
 static bool always_pinned_mode = false;
@@ -53,9 +58,15 @@ GOMP_enable_pinned_mode ()
     always_pinned_mode = true;
 }
 
+static int using_device_for_register_page_locked
+  = /* uninitialized */ -1;
+
 static void *
 linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
 {
+  gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, (unsigned long long) size, pin);
+
   /* Explicit pinning may not be required.  */
   pin = pin && !always_pinned_mode;
 
@@ -71,11 +82,32 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
       if (addr == MAP_FAILED)
 	return NULL;
 
-      if (mlock (addr, size))
+      int using_device
+	= __atomic_load_n (&using_device_for_register_page_locked,
+			   MEMMODEL_RELAXED);
+      gomp_debug (0, "  using_device=%d\n",
+		  using_device);
+      if (using_device != 0)
+	{
+	  using_device = gomp_register_page_locked (addr, size);
+	  int using_device_old
+	    = __atomic_exchange_n (&using_device_for_register_page_locked,
+				   using_device, MEMMODEL_RELAXED);
+	  gomp_debug (0, "  using_device=%d, using_device_old=%d\n",
+		      using_device, using_device_old);
+	  assert (using_device_old == -1
+		  /* We shouldn't have concurrently changed our mind.  */
+		  || using_device_old == using_device);
+	}
+      if (using_device == 0)
 	{
-	  gomp_debug (0, "libgomp: failed to pin memory (ulimit too low?)\n");
-	  munmap (addr, size);
-	  return NULL;
+	  gomp_debug (0, "  mlock\n");
+	  if (mlock (addr, size))
+	    {
+	      gomp_debug (0, "libgomp: failed to pin memory (ulimit too low?)\n");
+	      munmap (addr, size);
+	      return NULL;
+	    }
 	}
 
       return addr;
@@ -87,6 +119,9 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
 static void *
 linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
 {
+  gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, (unsigned long long) size, pin);
+
   /* Explicit pinning may not be required.  */
   pin = pin && !always_pinned_mode;
 
@@ -107,13 +142,28 @@ static void
 linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
 		     int pin)
 {
+  gomp_debug (0, "%s: memspace=%llu, addr=%p, size=%llu, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, addr, (unsigned long long) size, pin);
+
   /* Explicit pinning may not be required.  */
   pin = pin && !always_pinned_mode;
 
   if (memspace == ompx_unified_shared_mem_space)
     gomp_usm_free (addr, GOMP_DEVICE_ICV);
   else if (pin)
-    munmap (addr, size);
+    {
+      int using_device
+	= __atomic_load_n (&using_device_for_register_page_locked,
+			   MEMMODEL_RELAXED);
+      gomp_debug (0, "  using_device=%d\n",
+		  using_device);
+      if (using_device == 1)
+	gomp_unregister_page_locked (addr, size);
+      else
+	/* 'munlock'ing is implicit with following 'munmap'.  */
+	;
+      munmap (addr, size);
+    }
   else
     free (addr);
 }
@@ -122,6 +172,9 @@ static void *
 linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
 			size_t oldsize, size_t size, int oldpin, int pin)
 {
+  gomp_debug (0, "%s: memspace=%llu, addr=%p, oldsize=%llu, size=%llu, oldpin=%d, pin=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace, addr, (unsigned long long) oldsize, (unsigned long long) size, oldpin, pin);
+
   /* Explicit pinning may not be required.  */
   pin = pin && !always_pinned_mode;
 
@@ -129,6 +182,17 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
     goto manual_realloc;
   else if (oldpin && pin)
     {
+      /* We can only expect to be able to just 'mremap' if not using a device
+	 for registering page-locked memory.  */
+      int using_device
+	= __atomic_load_n (&using_device_for_register_page_locked,
+		       MEMMODEL_RELAXED);
+      gomp_debug (0, "  using_device=%d\n",
+		  using_device);
+      if (using_device != 0)
+	goto manual_realloc;
+
+      gomp_debug (0, "  mremap\n");
       void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
       if (newaddr == MAP_FAILED)
 	return NULL;
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index bb79ef8d9d7..345fc62d4f5 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -144,6 +144,8 @@ extern bool GOMP_OFFLOAD_free (int, void *);
 extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
 extern bool GOMP_OFFLOAD_usm_free (int, void *);
 extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
+extern bool GOMP_OFFLOAD_register_page_locked (void *, size_t);
+extern bool GOMP_OFFLOAD_unregister_page_locked (void *, size_t);
 extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index c001b468252..a5fa3f9daab 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1133,6 +1133,8 @@ extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
 			     void *);
 extern void * gomp_usm_alloc (size_t size, int device_num);
 extern void gomp_usm_free (void *device_ptr, int device_num);
+extern bool gomp_register_page_locked (void *, size_t);
+extern void gomp_unregister_page_locked (void *, size_t);
 
 /* Splay tree definitions.  */
 typedef struct splay_tree_node_s *splay_tree_node;
@@ -1392,6 +1394,8 @@ struct gomp_device_descr
   __typeof (GOMP_OFFLOAD_usm_alloc) *usm_alloc_func;
   __typeof (GOMP_OFFLOAD_usm_free) *usm_free_func;
   __typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
+  __typeof (GOMP_OFFLOAD_register_page_locked) *register_page_locked_func;
+  __typeof (GOMP_OFFLOAD_unregister_page_locked) *unregister_page_locked_func;
   __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
   __typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
   __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 9b786c9f2f6..8dbaadf848e 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -31,6 +31,9 @@ CUDA_ONE_CALL (cuMemAlloc)
 CUDA_ONE_CALL (cuMemAllocHost)
 CUDA_ONE_CALL (cuMemAllocManaged)
 CUDA_ONE_CALL (cuMemHostAlloc)
+CUDA_ONE_CALL_MAYBE_NULL (cuMemHostRegister_v2)
+CUDA_ONE_CALL (cuMemHostRegister)
+CUDA_ONE_CALL (cuMemHostUnregister)
 CUDA_ONE_CALL (cuMemcpy)
 CUDA_ONE_CALL (cuMemcpyDtoDAsync)
 CUDA_ONE_CALL (cuMemcpyDtoH)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 8e7b63bd637..698317f37ac 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -77,11 +77,14 @@ extern CUresult cuGetErrorString (CUresult, const char **);
 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
 			const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
+#undef cuMemHostRegister
+CUresult cuMemHostRegister (void *, size_t, unsigned int);
 #else
 typedef size_t (*CUoccupancyB2DSize)(int);
 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
 			   const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuMemHostRegister_v2 (void *, size_t, unsigned int);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
 					  CUoccupancyB2DSize, size_t, int);
 #endif
@@ -1704,6 +1707,36 @@ GOMP_OFFLOAD_is_usm_ptr (void *ptr)
   return managed;
 }
 
+
+bool
+GOMP_OFFLOAD_register_page_locked (void *ptr, size_t size)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
+		     __FUNCTION__, ptr, (unsigned long long) size);
+
+  unsigned int flags = 0;
+  /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
+     'flags |= CU_MEMHOSTREGISTER_PORTABLE;' here.  */
+  if (CUDA_CALL_EXISTS (cuMemHostRegister_v2))
+    CUDA_CALL (cuMemHostRegister_v2, ptr, size, flags);
+  else
+    CUDA_CALL (cuMemHostRegister, ptr, size, flags);
+
+  return true;
+}
+
+bool
+GOMP_OFFLOAD_unregister_page_locked (void *ptr, size_t size)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
+		     __FUNCTION__, ptr, (unsigned long long) size);
+
+  CUDA_CALL (cuMemHostUnregister, ptr);
+
+  return true;
+}
+
+
 void
 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
 			   void **hostaddrs, void **devaddrs,
diff --git a/libgomp/target.c b/libgomp/target.c
index 1b911c9bdb9..e7285188d1e 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -4584,6 +4584,141 @@ gomp_usm_free (void *device_ptr, int device_num)
   gomp_mutex_unlock (&devicep->lock);
 }
 
+
+/* Device (really: libgomp plugin) for registering paged-locked memory.  We
+   assume there is either none or exactly one such device for the lifetime of
+   the process.  */
+
+static struct gomp_device_descr *device_for_register_page_locked
+  = /* uninitialized */ (void *) -1;
+
+static struct gomp_device_descr *
+get_device_for_register_page_locked (void)
+{
+  gomp_debug (0, "%s\n",
+	      __FUNCTION__);
+
+  struct gomp_device_descr *device;
+#ifdef HAVE_SYNC_BUILTINS
+  device
+    = __atomic_load_n (&device_for_register_page_locked, MEMMODEL_RELAXED);
+  if (device == (void *) -1)
+    {
+      gomp_debug (0, "  init\n");
+
+      gomp_init_targets_once ();
+
+      device = NULL;
+      for (int i = 0; i < num_devices; ++i)
+	{
+	  gomp_debug (0, "  i=%d, target_id=%d\n",
+		      i, devices[i].target_id);
+
+	  /* We consider only the first device of potentially several of the
+	     same type as this functionality is not specific to an individual
+	     offloading device, but instead relates to the host-side
+	     implementation of the respective offloading implementation.  */
+	  if (devices[i].target_id != 0)
+	    continue;
+
+	  if (!devices[i].register_page_locked_func)
+	    continue;
+
+	  gomp_debug (0, "  found device: %p (%s)\n",
+		      &devices[i], devices[i].name);
+	  if (device)
+	    gomp_fatal ("Unclear how %s and %s libgomp plugins may"
+			" simultaneously provide functionality"
+			" to register page-locked memory",
+			device->name, devices[i].name);
+	  else
+	    device = &devices[i];
+	}
+
+      struct gomp_device_descr *device_old
+	= __atomic_exchange_n (&device_for_register_page_locked, device,
+			       MEMMODEL_RELAXED);
+      gomp_debug (0, "  old device_for_register_page_locked: %p\n",
+		  device_old);
+      assert (device_old == (void *) -1
+	      /* We shouldn't have concurrently found a different or no
+		 device.  */
+	      || device_old == device);
+    }
+#else /* !HAVE_SYNC_BUILTINS */
+  gomp_debug (0, "  not implemented for '!HAVE_SYNC_BUILTINS'\n");
+  (void) &device_for_register_page_locked;
+  device = NULL;
+#endif /* HAVE_SYNC_BUILTINS */
+
+  gomp_debug (0, "  -> device=%p (%s)\n",
+	      device, device ? device->name : "[none]");
+  return device;
+}
+
+/* Register page-locked memory region.
+   Returns whether we have a device capable of that.  */
+
+attribute_hidden bool
+gomp_register_page_locked (void *ptr, size_t size)
+{
+  gomp_debug (0, "%s: ptr=%p, size=%llu\n",
+	      __FUNCTION__, ptr, (unsigned long long) size);
+
+  struct gomp_device_descr *device = get_device_for_register_page_locked ();
+  gomp_debug (0, "  device=%p (%s)\n",
+	      device, device ? device->name : "[none]");
+  if (device)
+    {
+      gomp_mutex_lock (&device->lock);
+      if (device->state == GOMP_DEVICE_UNINITIALIZED)
+	gomp_init_device (device);
+      else if (device->state == GOMP_DEVICE_FINALIZED)
+	{
+	  gomp_mutex_unlock (&device->lock);
+	  gomp_fatal ("Device %s for registering page-locked memory"
+		      " is finalized", device->name);
+	}
+      gomp_mutex_unlock (&device->lock);
+
+      if (!device->register_page_locked_func (ptr, size))
+	gomp_fatal ("Failed to register page-locked memory"
+		    " via %s libgomp plugin",
+		    device->name);
+    }
+  return device != NULL;
+}
+
+/* Unregister page-locked memory region.
+   This must only be called if 'gomp_register_page_locked' returned 'true'.  */
+
+attribute_hidden void
+gomp_unregister_page_locked (void *ptr, size_t size)
+{
+  gomp_debug (0, "%s: ptr=%p\n",
+	      __FUNCTION__, ptr);
+
+  struct gomp_device_descr *device = get_device_for_register_page_locked ();
+  gomp_debug (0, "  device=%p (%s)\n",
+	      device, device ? device->name : "[none]");
+  assert (device);
+
+  gomp_mutex_lock (&device->lock);
+  assert (device->state != GOMP_DEVICE_UNINITIALIZED);
+  if (device->state == GOMP_DEVICE_FINALIZED)
+    {
+      gomp_mutex_unlock (&device->lock);
+      return;
+    }
+  gomp_mutex_unlock (&device->lock);
+
+  if (!device->unregister_page_locked_func (ptr, size))
+    gomp_fatal ("Failed to unregister page-locked memory"
+		" via %s libgomp plugin",
+		device->name);
+}
+
+
 int
 omp_target_is_present (const void *ptr, int device_num)
 {
@@ -5268,6 +5403,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
   DLSYM_OPT (usm_alloc, usm_alloc);
   DLSYM_OPT (usm_free, usm_free);
   DLSYM_OPT (is_usm_ptr, is_usm_ptr);
+  DLSYM_OPT (register_page_locked, register_page_locked);
+  DLSYM_OPT (unregister_page_locked, unregister_page_locked);
   DLSYM (dev2host);
   DLSYM (host2dev);
   DLSYM (evaluate_device);
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-1.c b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
index fb7ac8b0080..bd71e22b003 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-1.c
@@ -2,6 +2,8 @@
 
 /* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory works.  */
 
 #include <stdio.h>
@@ -67,9 +69,14 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE;
   CHECK_SIZE (SIZE*3);
+#endif
 
   const omp_alloctrait_t traits[] = {
       { omp_atk_pinned, 1 }
@@ -85,19 +92,37 @@ main ()
     abort ();
 
   int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
   if (amount == 0)
     abort ();
+#endif
 
   p = omp_realloc (p, SIZE*2, allocator, allocator);
 
   int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
   if (amount2 <= amount)
     abort ();
+#endif
 
   p = omp_calloc (1, SIZE, allocator);
 
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
   if (get_pinned_mem () <= amount2)
     abort ();
+#endif
 
   verify0 (p, SIZE);
 
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-2.c b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
index 651b89fb42f..c71248b046d 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-2.c
@@ -2,6 +2,8 @@
 
 /* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory works (pool_size code path).  */
 
 #include <stdio.h>
@@ -67,9 +69,14 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE;
   CHECK_SIZE (SIZE*3);
+#endif
 
   const omp_alloctrait_t traits[] = {
       { omp_atk_pinned, 1 },
@@ -87,23 +94,41 @@ main ()
     abort ();
 
   int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
   if (amount == 0)
     abort ();
+#endif
 
   p = omp_realloc (p, SIZE*2, allocator, allocator);
   if (!p)
     abort ();
 
   int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
   if (amount2 <= amount)
     abort ();
+#endif
 
   p = omp_calloc (1, SIZE, allocator);
   if (!p)
     abort ();
 
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
   if (get_pinned_mem () <= amount2)
     abort ();
+#endif
 
   verify0 (p, SIZE);
 
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-3.c b/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
index f41797881ef..26b0c352d85 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-3.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory fails correctly.  */
 
 #include <stdio.h>
@@ -74,8 +76,14 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* This needs to be large enough to cover multiple pages.  */
   const int SIZE = PAGE_SIZE*4;
+#endif
+  const int PIN_LIMIT = PAGE_SIZE*2;
 
   /* Pinned memory, no fallback.  */
   const omp_alloctrait_t traits1[] = {
@@ -92,21 +100,33 @@ main ()
   omp_allocator_handle_t allocator2 = omp_init_allocator (omp_default_mem_space, 2, traits2);
 
   /* Ensure that the limit is smaller than the allocation.  */
-  set_pin_limit (SIZE/2);
+  set_pin_limit (PIN_LIMIT);
 
   // Sanity check
   if (get_pinned_mem () != 0)
     abort ();
 
-  // Should fail
   void *p = omp_alloc (SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail
   p = omp_calloc (1, SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
   // Should fall back
   p = omp_alloc (SIZE, allocator2);
@@ -119,16 +139,29 @@ main ()
     abort ();
   verify0 (p, SIZE);
 
-  // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
   p = omp_realloc (notpinned, SIZE, allocator1, omp_default_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'; does reallocate.
+  if (!notpinned || !p || p == notpinned)
+    abort ();
+#else
+  // Should fail to realloc
   if (!notpinned || p)
     abort ();
+#endif
 
-  // Should fall back to no realloc needed
+#ifdef OFFLOAD_DEVICE_NVPTX
+  void *p_ = omp_realloc (p, SIZE, allocator2, allocator1);
+  // Does reallocate.
+  if (p_ == p)
+    abort ();
+#else
   p = omp_realloc (notpinned, SIZE, allocator2, omp_default_mem_alloc);
+  // Should fall back to no realloc needed
   if (p != notpinned)
     abort ();
+#endif
 
   // No memory should have been pinned
   int amount = get_pinned_mem ();
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-4.c b/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
index a878da8c558..0bd6a552d94 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-4.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that pinned memory fails correctly, pool_size code path.  */
 
 #include <stdio.h>
@@ -74,8 +76,14 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* This needs to be large enough to cover multiple pages.  */
   const int SIZE = PAGE_SIZE*4;
+#endif
+  const int PIN_LIMIT = PAGE_SIZE*2;
 
   /* Pinned memory, no fallback.  */
   const omp_alloctrait_t traits1[] = {
@@ -94,21 +102,33 @@ main ()
   omp_allocator_handle_t allocator2 = omp_init_allocator (omp_default_mem_space, 3, traits2);
 
   /* Ensure that the limit is smaller than the allocation.  */
-  set_pin_limit (SIZE/2);
+  set_pin_limit (PIN_LIMIT);
 
   // Sanity check
   if (get_pinned_mem () != 0)
     abort ();
 
-  // Should fail
   void *p = omp_alloc (SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail
   p = omp_calloc (1, SIZE, allocator1);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
   // Should fall back
   p = omp_alloc (SIZE, allocator2);
@@ -121,16 +141,29 @@ main ()
     abort ();
   verify0 (p, SIZE);
 
-  // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
   p = omp_realloc (notpinned, SIZE, allocator1, omp_default_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'; does reallocate.
+  if (!notpinned || !p || p == notpinned)
+    abort ();
+#else
+  // Should fail to realloc
   if (!notpinned || p)
     abort ();
+#endif
 
-  // Should fall back to no realloc needed
+#ifdef OFFLOAD_DEVICE_NVPTX
+  void *p_ = omp_realloc (p, SIZE, allocator2, allocator1);
+  // Does reallocate.
+  if (p_ == p)
+    abort ();
+#else
   p = omp_realloc (notpinned, SIZE, allocator2, omp_default_mem_alloc);
+  // Should fall back to no realloc needed
   if (p != notpinned)
     abort ();
+#endif
 
   // No memory should have been pinned
   int amount = get_pinned_mem ();
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-5.c b/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
index 65983b3d03d..623c96a78e3 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-5.c
@@ -2,6 +2,8 @@
 
 /* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that ompx_pinned_mem_alloc works.  */
 
 #include <stdio.h>
@@ -67,9 +69,14 @@ verify0 (char *p, size_t s)
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE;
   CHECK_SIZE (SIZE*3);
+#endif
 
   // Sanity check
   if (get_pinned_mem () != 0)
@@ -80,19 +87,37 @@ main ()
     abort ();
 
   int amount = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount != 0)
+    abort ();
+#else
   if (amount == 0)
     abort ();
+#endif
 
   p = omp_realloc (p, SIZE*2, ompx_pinned_mem_alloc, ompx_pinned_mem_alloc);
 
   int amount2 = get_pinned_mem ();
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (amount2 != 0)
+    abort ();
+#else
   if (amount2 <= amount)
     abort ();
+#endif
 
   p = omp_calloc (1, SIZE, ompx_pinned_mem_alloc);
 
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* This doesn't show up as process 'VmLck'ed memory.  */
+  if (get_pinned_mem () != 0)
+    abort ();
+#else
   if (get_pinned_mem () <= amount2)
     abort ();
+#endif
 
   verify0 (p, SIZE);
 
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-6.c b/libgomp/testsuite/libgomp.c/alloc-pinned-6.c
index bbe20c04875..c0f8b260e37 100644
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-6.c
+++ b/libgomp/testsuite/libgomp.c/alloc-pinned-6.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 
+/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
+
 /* Test that ompx_pinned_mem_alloc fails correctly.  */
 
 #include <stdio.h>
@@ -66,31 +68,55 @@ set_pin_limit ()
 int
 main ()
 {
+#ifdef OFFLOAD_DEVICE_NVPTX
+  /* Go big or go home.  */
+  const int SIZE = 40 * 1024 * 1024;
+#else
   /* Allocate at least a page each time, but stay within the ulimit.  */
   const int SIZE = PAGE_SIZE*4;
+#endif
+  const int PIN_LIMIT = PAGE_SIZE*2;
 
   /* Ensure that the limit is smaller than the allocation.  */
-  set_pin_limit (SIZE/2);
+  set_pin_limit (PIN_LIMIT);
 
   // Sanity check
   if (get_pinned_mem () != 0)
     abort ();
 
-  // Should fail
   void *p = omp_alloc (SIZE, ompx_pinned_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail
   p = omp_calloc (1, SIZE, ompx_pinned_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'.
+  if (!p)
+    abort ();
+#else
+  // Should fail
   if (p)
     abort ();
+#endif
 
-  // Should fail to realloc
   void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
   p = omp_realloc (notpinned, SIZE, ompx_pinned_mem_alloc, omp_default_mem_alloc);
+#ifdef OFFLOAD_DEVICE_NVPTX
+  // Doesn't care about 'set_pin_limit'; does reallocate.
+  if (!notpinned || !p || p == notpinned)
+    abort ();
+#else
+  // Should fail to realloc
   if (!notpinned || p)
     abort ();
+#endif
 
   // No memory should have been pinned
   int amount = get_pinned_mem ();
-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [og12] Attempt to register OpenMP pinned memory using a device instead of 'mlock' (was: [PATCH] libgomp, openmp: pinned memory)
  2023-02-16 22:06                         ` [og12] " Thomas Schwinge
@ 2023-02-17  8:12                           ` Thomas Schwinge
  2023-02-20  9:48                             ` Andrew Stubbs
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Schwinge @ 2023-02-17  8:12 UTC (permalink / raw)
  To: Andrew Stubbs; +Cc: Jakub Jelinek, Tobias Burnus, gcc-patches

Hi Andrew!

On 2023-02-16T23:06:44+0100, I wrote:
> On 2023-02-16T16:17:32+0000, "Stubbs, Andrew via Gcc-patches" <gcc-patches@gcc.gnu.org> wrote:
>> The mmap implementation was not optimized for a lot of small allocations, and I can't see that issue changing here
>
> That's correct, 'mmap' remains.  Under the hood, 'cuMemHostRegister' must
> surely also be doing some 'mlock'-like thing, so I figured it's best to
> feed page-boundary memory regions to it, which 'mmap' gets us.
>
>> so I don't know if this can be used for mlockall replacement.
>>
>> I had assumed that using the Cuda allocator would fix that limitation.
>
> From what I've read (but no first-hand experiments), there's non-trivial
> overhead with 'cuMemHostRegister' (just like with 'mlock'), so routing
> all small allocations individually through it probably isn't a good idea
> either.  Therefore, I suppose, we'll indeed want to use some local
> allocator if we wish this "optimized for a lot of small allocations".

Eh, I suppose your point indirectly was that instead of 'mmap' plus
'cuMemHostRegister' we ought to use 'cuMemAllocHost'/'cuMemHostAlloc', as
we assume those already do implement such a local allocator.  Let me
quickly change that indeed -- we don't currently have a need to use
'cuMemHostRegister' instead of 'cuMemAllocHost'/'cuMemHostAlloc'.

> And, getting rid of 'mlockall' is yet another topic.

Here, the need to use 'cuMemHostRegister' may then again come up, as
begun to discuss as my "different idea" re "-foffload-memory=pinned",
<https://inbox.sourceware.org/gcc-patches/87sff9zl3u.fsf@euler.schwinge.homeip.net>.
(Let's continue that discussion there.)
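
For comparison, the cuMemAllocHost/cuMemHostAlloc route mentioned above lets
the driver both allocate and pin, so there is no separate mmap or registration
step.  Again a sketch only (not the og12 implementation; assumes an
initialized CUDA context):

  #include <stddef.h>
  #include <cuda.h>

  /* Let the CUDA driver allocate page-locked host memory directly.  */
  static void *
  pinned_alloc_via_driver (size_t size)
  {
    void *addr;
    /* Flags of 0 here; CU_MEMHOSTALLOC_PORTABLE etc. could be added if
       the pinning needs to be visible to all contexts.  */
    if (cuMemHostAlloc (&addr, size, 0) != CUDA_SUCCESS)
      return NULL;
    return addr;
  }

  static void
  pinned_free_via_driver (void *addr)
  {
    cuMemFreeHost (addr);
  }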


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [og12] Attempt to register OpenMP pinned memory using a device instead of 'mlock' (was: [PATCH] libgomp, openmp: pinned memory)
  2023-02-17  8:12                           ` Thomas Schwinge
@ 2023-02-20  9:48                             ` Andrew Stubbs
  2023-02-20 13:53                               ` [og12] Attempt to not just register but allocate OpenMP pinned memory using a device (was: [og12] Attempt to register OpenMP pinned memory using a device instead of 'mlock') Thomas Schwinge
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew Stubbs @ 2023-02-20  9:48 UTC (permalink / raw)
  To: Thomas Schwinge; +Cc: Jakub Jelinek, Tobias Burnus, gcc-patches

On 17/02/2023 08:12, Thomas Schwinge wrote:
> Hi Andrew!
> 
> On 2023-02-16T23:06:44+0100, I wrote:
>> On 2023-02-16T16:17:32+0000, "Stubbs, Andrew via Gcc-patches" <gcc-patches@gcc.gnu.org> wrote:
>>> The mmap implementation was not optimized for a lot of small allocations, and I can't see that issue changing here
>>
>> That's correct, 'mmap' remains.  Under the hood, 'cuMemHostRegister' must
>> surely also be doing some 'mlock'-like thing, so I figured it's best to
>> feed page-boundary memory regions to it, which 'mmap' gets us.
>>
>>> so I don't know if this can be used for mlockall replacement.
>>>
>>> I had assumed that using the Cuda allocator would fix that limitation.
>>
>>  From what I've read (but no first-hand experiments), there's non-trivial
>> overhead with 'cuMemHostRegister' (just like with 'mlock'), so routing
>> all small allocations individually through it probably isn't a good idea
>> either.  Therefore, I suppose, we'll indeed want to use some local
>> allocator if we wish this "optimized for a lot of small allocations".
> 
> Eh, I suppose your point indirectly was that instead of 'mmap' plus
> 'cuMemHostRegister' we ought to use 'cuMemAllocHost'/'cuMemHostAlloc', as
> we assume those already do implement such a local allocator.  Let me
> quickly change that indeed -- we don't currently have a need to use
> 'cuMemHostRegister' instead of 'cuMemAllocHost'/'cuMemHostAlloc'.


Yes, that's right. I suppose it makes sense to register memory we 
already have, but if we want new memory then trying to reinvent what 
happens inside cuMemAllocHost is pointless.
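
For contrast, a minimal sketch of the register-what-we-already-have case
(illustration only; assumes a page-aligned region, for example from mmap,
and an existing CUDA context):

  #include <cuda.h>
  #include <stddef.h>

  /* Page-lock a region the application already owns; the memory itself
     is neither moved nor reallocated.  */
  static int
  pin_existing (void *addr, size_t size)
  {
    return cuMemHostRegister (addr, size, 0) == CUDA_SUCCESS ? 0 : -1;
  }

  static void
  unpin_existing (void *addr)
  {
    cuMemHostUnregister (addr);
  }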

Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [og12] Attempt to not just register but allocate OpenMP pinned memory using a device (was: [og12] Attempt to register OpenMP pinned memory using a device instead of 'mlock')
  2023-02-20  9:48                             ` Andrew Stubbs
@ 2023-02-20 13:53                               ` Thomas Schwinge
  0 siblings, 0 replies; 28+ messages in thread
From: Thomas Schwinge @ 2023-02-20 13:53 UTC (permalink / raw)
  To: Andrew Stubbs, gcc-patches; +Cc: Jakub Jelinek, Tobias Burnus

[-- Attachment #1: Type: text/plain, Size: 2205 bytes --]

Hi!

On 2023-02-20T09:48:53+0000, Andrew Stubbs <ams@codesourcery.com> wrote:
> On 17/02/2023 08:12, Thomas Schwinge wrote:
>> On 2023-02-16T23:06:44+0100, I wrote:
>>> On 2023-02-16T16:17:32+0000, "Stubbs, Andrew via Gcc-patches" <gcc-patches@gcc.gnu.org> wrote:
>>>> The mmap implementation was not optimized for a lot of small allocations, and I can't see that issue changing here
>>>
>>> That's correct, 'mmap' remains.  Under the hood, 'cuMemHostRegister' must
>>> surely also be doing some 'mlock'-like thing, so I figured it's best to
>>> feed page-boundary memory regions to it, which 'mmap' gets us.
>>>
>>>> so I don't know if this can be used for mlockall replacement.
>>>>
>>>> I had assumed that using the Cuda allocator would fix that limitation.
>>>
>>>  From what I've read (but no first-hand experiments), there's non-trivial
>>> overhead with 'cuMemHostRegister' (just like with 'mlock'), so routing
>>> all small allocations individually through it probably isn't a good idea
>>> either.  Therefore, I suppose, we'll indeed want to use some local
>>> allocator if we wish this "optimized for a lot of small allocations".
>>
>> Eh, I suppose your point indirectly was that instead of 'mmap' plus
>> 'cuMemHostRegister' we ought to use 'cuMemAllocHost'/'cuMemHostAlloc', as
>> we assume those already do implement such a local allocator.  Let me
>> quickly change that indeed -- we don't currently have a need to use
>> 'cuMemHostRegister' instead of 'cuMemAllocHost'/'cuMemHostAlloc'.
>
> Yes, that's right. I suppose it makes sense to register memory we
> already have, but if we want new memory then trying to reinvent what
> happens inside cuMemAllocHost is pointless.

I've pushed to devel/omp/gcc-12 branch
commit 4bd844f3e0202b3d083f0784f4343570c88bb86c
"Attempt to not just register but allocate OpenMP pinned memory using a device",
see attached.


Grüße
 Thomas


-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Attempt-to-not-just-register-but-allocate-OpenMP-pin.patch --]
[-- Type: text/x-diff, Size: 19932 bytes --]

From 4bd844f3e0202b3d083f0784f4343570c88bb86c Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Mon, 20 Feb 2023 14:44:43 +0100
Subject: [PATCH] Attempt to not just register but allocate OpenMP pinned
 memory using a device

... instead of 'mmap' plus attempting to register using a device.

Implemented for nvptx offloading via 'cuMemHostAlloc'.

This re-works og12 commit a5a4800e92773da7126c00a9c79b172494d58ab5
"Attempt to register OpenMP pinned memory using a device instead of 'mlock'".

	include/
	* cuda/cuda.h (cuMemHostRegister, cuMemHostUnregister): Remove.
	libgomp/
	* config/linux/allocator.c (linux_memspace_alloc): Add 'init0'
	formal parameter.  Adjust all users.
	(linux_memspace_alloc, linux_memspace_free): Attempt to allocate
	OpenMP pinned memory using a device instead of 'mmap' plus
	attempting to register using a device.
	* libgomp-plugin.h (GOMP_OFFLOAD_register_page_locked)
	(GOMP_OFFLOAD_unregister_page_locked): Remove.
	(GOMP_OFFLOAD_page_locked_host_alloc)
	(GOMP_OFFLOAD_page_locked_host_free): New.
	* libgomp.h (gomp_register_page_locked)
	(gomp_unregister_page_locked): Remove.
	(gomp_page_locked_host_alloc, gomp_page_locked_host_free): New.
	(struct gomp_device_descr): Remove 'register_page_locked_func',
	'unregister_page_locked_func'.  Add 'page_locked_host_alloc_func',
	'page_locked_host_free_func'.
	* plugin/cuda-lib.def (cuMemHostRegister_v2, cuMemHostRegister)
	(cuMemHostUnregister): Remove.
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_register_page_locked)
	(GOMP_OFFLOAD_unregister_page_locked): Remove.
	(GOMP_OFFLOAD_page_locked_host_alloc)
	(GOMP_OFFLOAD_page_locked_host_free): New.
	* target.c (gomp_register_page_locked)
	(gomp_unregister_page_locked): Remove.
	(gomp_page_locked_host_alloc, gomp_page_locked_host_free): Add.
	(gomp_load_plugin_for_device): Don't handle
	'register_page_locked', 'unregister_page_locked'.  Handle
	'page_locked_host_alloc', 'page_locked_host_free'.

Suggested-by: Andrew Stubbs <ams@codesourcery.com>
---
 include/cuda/cuda.h              |  3 --
 libgomp/config/linux/allocator.c | 85 ++++++++++++++++++--------------
 libgomp/libgomp-plugin.h         |  4 +-
 libgomp/libgomp.h                |  8 +--
 libgomp/plugin/cuda-lib.def      |  3 --
 libgomp/plugin/plugin-nvptx.c    | 33 +++++++------
 libgomp/target.c                 | 49 +++++++++---------
 7 files changed, 98 insertions(+), 87 deletions(-)

diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h
index b0c7636d318..062d394b95f 100644
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -183,9 +183,6 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t);
 CUresult cuMemAllocHost (void **, size_t);
 CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
 CUresult cuMemHostAlloc (void **, size_t, unsigned int);
-#define cuMemHostRegister cuMemHostRegister_v2
-CUresult cuMemHostRegister(void *, size_t, unsigned int);
-CUresult cuMemHostUnregister(void *);
 CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
 #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
 CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
index 81e64b268e9..3e1bd5a1285 100644
--- a/libgomp/config/linux/allocator.c
+++ b/libgomp/config/linux/allocator.c
@@ -25,8 +25,9 @@
 /* Implement malloc routines that can handle pinned memory on Linux.
 
    Given that pinned memory is typically used to help host <-> device memory
-   transfers, we attempt to register such using a device (really: libgomp
-   plugin), but fall back to mlock if no suitable device is available.
+   transfers, we attempt to allocate such memory using a device (really:
+   libgomp plugin), but fall back to mmap plus mlock if no suitable device is
+   available.
 
    It's possible to use mlock on any heap memory, but using munlock is
    problematic if there are multiple pinned allocations on the same page.
@@ -58,40 +59,36 @@ GOMP_enable_pinned_mode ()
     always_pinned_mode = true;
 }
 
-static int using_device_for_register_page_locked
+static int using_device_for_page_locked
   = /* uninitialized */ -1;
 
 static void *
-linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
+linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
+		      bool init0)
 {
-  gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d\n",
-	      __FUNCTION__, (unsigned long long) memspace, (unsigned long long) size, pin);
+  gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d, init0=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace,
+	      (unsigned long long) size, pin, init0);
 
   /* Explicit pinning may not be required.  */
   pin = pin && !always_pinned_mode;
 
+  void *addr;
+
   if (memspace == ompx_unified_shared_mem_space)
-    {
-      return gomp_usm_alloc (size, GOMP_DEVICE_ICV);
-    }
+    addr = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
   else if (pin)
     {
-      /* 'mmap' zero-initializes, which 'linux_memspace_calloc' relies on.  */
-      void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
-			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-      if (addr == MAP_FAILED)
-	return NULL;
-
       int using_device
-	= __atomic_load_n (&using_device_for_register_page_locked,
+	= __atomic_load_n (&using_device_for_page_locked,
 			   MEMMODEL_RELAXED);
       gomp_debug (0, "  using_device=%d\n",
 		  using_device);
       if (using_device != 0)
 	{
-	  using_device = gomp_register_page_locked (addr, size);
+	  using_device = gomp_page_locked_host_alloc (&addr, size);
 	  int using_device_old
-	    = __atomic_exchange_n (&using_device_for_register_page_locked,
+	    = __atomic_exchange_n (&using_device_for_page_locked,
 				   using_device, MEMMODEL_RELAXED);
 	  gomp_debug (0, "  using_device=%d, using_device_old=%d\n",
 		      using_device, using_device_old);
@@ -101,19 +98,37 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
 	}
       if (using_device == 0)
 	{
-	  gomp_debug (0, "  mlock\n");
-	  if (mlock (addr, size))
+	  gomp_debug (0, "  mmap\n");
+	  addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	  if (addr == MAP_FAILED)
+	    addr = NULL;
+	  else
 	    {
-	      gomp_debug (0, "libgomp: failed to pin memory (ulimit too low?)\n");
-	      munmap (addr, size);
-	      return NULL;
+	      /* 'mmap' zero-initializes.  */
+	      init0 = false;
+
+	      gomp_debug (0, "  mlock\n");
+	      if (mlock (addr, size))
+		{
+		  gomp_debug (0, "libgomp: failed to pin memory"
+			      " (ulimit too low?)\n");
+		  munmap (addr, size);
+		  addr = NULL;
+		}
 	    }
 	}
-
-      return addr;
     }
   else
-    return malloc (size);
+    addr = malloc (size);
+
+  if (addr && init0)
+    {
+      gomp_debug (0, "  init0\n");
+      memset (addr, 0, size);
+    }
+
+  return addr;
 }
 
 static void *
@@ -132,8 +147,7 @@ linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
       return ret;
     }
   else if (pin)
-    /* If PINned, 'linux_memspace_alloc' 'mmap's, which zero-initializes.  */
-    return linux_memspace_alloc (memspace, size, pin);
+    return linux_memspace_alloc (memspace, size, pin, true);
   else
     return calloc (1, size);
 }
@@ -153,16 +167,15 @@ linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
   else if (pin)
     {
       int using_device
-	= __atomic_load_n (&using_device_for_register_page_locked,
+	= __atomic_load_n (&using_device_for_page_locked,
 			   MEMMODEL_RELAXED);
       gomp_debug (0, "  using_device=%d\n",
 		  using_device);
       if (using_device == 1)
-	gomp_unregister_page_locked (addr, size);
+	gomp_page_locked_host_free (addr);
       else
 	/* 'munlock'ing is implicit with following 'munmap'.  */
-	;
-      munmap (addr, size);
+	munmap (addr, size);
     }
   else
     free (addr);
@@ -183,9 +196,9 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
   else if (oldpin && pin)
     {
       /* We can only expect to be able to just 'mremap' if not using a device
-	 for registering page-locked memory.  */
+	 for page-locked memory.  */
       int using_device
-	= __atomic_load_n (&using_device_for_register_page_locked,
+	= __atomic_load_n (&using_device_for_page_locked,
 		       MEMMODEL_RELAXED);
       gomp_debug (0, "  using_device=%d\n",
 		  using_device);
@@ -205,7 +218,7 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
     return realloc (addr, size);
 
 manual_realloc:
-  void *newaddr = linux_memspace_alloc (memspace, size, pin);
+  void *newaddr = linux_memspace_alloc (memspace, size, pin, false);
   if (newaddr)
     {
       memcpy (newaddr, addr, oldsize < size ? oldsize : size);
@@ -216,7 +229,7 @@ manual_realloc:
 }
 
 #define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
-  linux_memspace_alloc (MEMSPACE, SIZE, PIN)
+  linux_memspace_alloc (MEMSPACE, SIZE, PIN, false)
 #define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \
   linux_memspace_calloc (MEMSPACE, SIZE, PIN)
 #define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 345fc62d4f5..66d995f33e8 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -144,8 +144,8 @@ extern bool GOMP_OFFLOAD_free (int, void *);
 extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
 extern bool GOMP_OFFLOAD_usm_free (int, void *);
 extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
-extern bool GOMP_OFFLOAD_register_page_locked (void *, size_t);
-extern bool GOMP_OFFLOAD_unregister_page_locked (void *, size_t);
+extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
+extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
 extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index a5fa3f9daab..ba12d558465 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1133,8 +1133,8 @@ extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
 			     void *);
 extern void * gomp_usm_alloc (size_t size, int device_num);
 extern void gomp_usm_free (void *device_ptr, int device_num);
-extern bool gomp_register_page_locked (void *, size_t);
-extern void gomp_unregister_page_locked (void *, size_t);
+extern bool gomp_page_locked_host_alloc (void **, size_t);
+extern void gomp_page_locked_host_free (void *);
 
 /* Splay tree definitions.  */
 typedef struct splay_tree_node_s *splay_tree_node;
@@ -1394,8 +1394,8 @@ struct gomp_device_descr
   __typeof (GOMP_OFFLOAD_usm_alloc) *usm_alloc_func;
   __typeof (GOMP_OFFLOAD_usm_free) *usm_free_func;
   __typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
-  __typeof (GOMP_OFFLOAD_register_page_locked) *register_page_locked_func;
-  __typeof (GOMP_OFFLOAD_unregister_page_locked) *unregister_page_locked_func;
+  __typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
+  __typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
   __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
   __typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
   __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 8dbaadf848e..9b786c9f2f6 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -31,9 +31,6 @@ CUDA_ONE_CALL (cuMemAlloc)
 CUDA_ONE_CALL (cuMemAllocHost)
 CUDA_ONE_CALL (cuMemAllocManaged)
 CUDA_ONE_CALL (cuMemHostAlloc)
-CUDA_ONE_CALL_MAYBE_NULL (cuMemHostRegister_v2)
-CUDA_ONE_CALL (cuMemHostRegister)
-CUDA_ONE_CALL (cuMemHostUnregister)
 CUDA_ONE_CALL (cuMemcpy)
 CUDA_ONE_CALL (cuMemcpyDtoDAsync)
 CUDA_ONE_CALL (cuMemcpyDtoH)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 698317f37ac..a7896e4dabe 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -77,14 +77,11 @@ extern CUresult cuGetErrorString (CUresult, const char **);
 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
 			const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
-#undef cuMemHostRegister
-CUresult cuMemHostRegister (void *, size_t, unsigned int);
 #else
 typedef size_t (*CUoccupancyB2DSize)(int);
 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
 			   const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
-CUresult cuMemHostRegister_v2 (void *, size_t, unsigned int);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
 					  CUoccupancyB2DSize, size_t, int);
 #endif
@@ -1709,30 +1706,36 @@ GOMP_OFFLOAD_is_usm_ptr (void *ptr)
 
 
 bool
-GOMP_OFFLOAD_register_page_locked (void *ptr, size_t size)
+GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
 {
   GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
 		     __FUNCTION__, ptr, (unsigned long long) size);
 
+  CUresult r;
+
   unsigned int flags = 0;
   /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
-     'flags |= CU_MEMHOSTREGISTER_PORTABLE;' here.  */
-  if (CUDA_CALL_EXISTS (cuMemHostRegister_v2))
-    CUDA_CALL (cuMemHostRegister_v2, ptr, size, flags);
-  else
-    CUDA_CALL (cuMemHostRegister, ptr, size, flags);
-
+     'flags |= CU_MEMHOSTALLOC_PORTABLE;' here.  */
+  r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
+  if (r == CUDA_ERROR_OUT_OF_MEMORY)
+    *ptr = NULL;
+  else if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
+      return false;
+    }
+  GOMP_PLUGIN_debug (0, "  -> *ptr=%p\n",
+		     *ptr);
   return true;
 }
 
 bool
-GOMP_OFFLOAD_unregister_page_locked (void *ptr, size_t size)
+GOMP_OFFLOAD_page_locked_host_free (void *ptr)
 {
-  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
-		     __FUNCTION__, ptr, (unsigned long long) size);
-
-  CUDA_CALL (cuMemHostUnregister, ptr);
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p\n",
+		     __FUNCTION__, ptr);
 
+  CUDA_CALL (cuMemFreeHost, ptr);
   return true;
 }
 
diff --git a/libgomp/target.c b/libgomp/target.c
index e7285188d1e..24109f28ddc 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -4585,15 +4585,15 @@ gomp_usm_free (void *device_ptr, int device_num)
 }
 
 
-/* Device (really: libgomp plugin) for registering paged-locked memory.  We
+/* Device (really: libgomp plugin) to use for page-locked memory.  We
    assume there is either none or exactly one such device for the lifetime of
    the process.  */
 
-static struct gomp_device_descr *device_for_register_page_locked
+static struct gomp_device_descr *device_for_page_locked
   = /* uninitialized */ (void *) -1;
 
 static struct gomp_device_descr *
-get_device_for_register_page_locked (void)
+get_device_for_page_locked (void)
 {
   gomp_debug (0, "%s\n",
 	      __FUNCTION__);
@@ -4601,7 +4601,7 @@ get_device_for_register_page_locked (void)
   struct gomp_device_descr *device;
 #ifdef HAVE_SYNC_BUILTINS
   device
-    = __atomic_load_n (&device_for_register_page_locked, MEMMODEL_RELAXED);
+    = __atomic_load_n (&device_for_page_locked, MEMMODEL_RELAXED);
   if (device == (void *) -1)
     {
       gomp_debug (0, "  init\n");
@@ -4621,7 +4621,7 @@ get_device_for_register_page_locked (void)
 	  if (devices[i].target_id != 0)
 	    continue;
 
-	  if (!devices[i].register_page_locked_func)
+	  if (!devices[i].page_locked_host_alloc_func)
 	    continue;
 
 	  gomp_debug (0, "  found device: %p (%s)\n",
@@ -4629,16 +4629,16 @@ get_device_for_register_page_locked (void)
 	  if (device)
 	    gomp_fatal ("Unclear how %s and %s libgomp plugins may"
 			" simultaneously provide functionality"
-			" to register page-locked memory",
+			" for page-locked memory",
 			device->name, devices[i].name);
 	  else
 	    device = &devices[i];
 	}
 
       struct gomp_device_descr *device_old
-	= __atomic_exchange_n (&device_for_register_page_locked, device,
+	= __atomic_exchange_n (&device_for_page_locked, device,
 			       MEMMODEL_RELAXED);
-      gomp_debug (0, "  old device_for_register_page_locked: %p\n",
+      gomp_debug (0, "  old device_for_page_locked: %p\n",
 		  device_old);
       assert (device_old == (void *) -1
 	      /* We shouldn't have concurrently found a different or no
@@ -4647,7 +4647,7 @@ get_device_for_register_page_locked (void)
     }
 #else /* !HAVE_SYNC_BUILTINS */
   gomp_debug (0, "  not implemented for '!HAVE_SYNC_BUILTINS'\n");
-  (void) &device_for_register_page_locked;
+  (void) &device_for_page_locked;
   device = NULL;
 #endif /* HAVE_SYNC_BUILTINS */
 
@@ -4656,16 +4656,16 @@ get_device_for_register_page_locked (void)
   return device;
 }
 
-/* Register page-locked memory region.
+/* Allocate page-locked host memory.
    Returns whether we have a device capable of that.  */
 
 attribute_hidden bool
-gomp_register_page_locked (void *ptr, size_t size)
+gomp_page_locked_host_alloc (void **ptr, size_t size)
 {
   gomp_debug (0, "%s: ptr=%p, size=%llu\n",
 	      __FUNCTION__, ptr, (unsigned long long) size);
 
-  struct gomp_device_descr *device = get_device_for_register_page_locked ();
+  struct gomp_device_descr *device = get_device_for_page_locked ();
   gomp_debug (0, "  device=%p (%s)\n",
 	      device, device ? device->name : "[none]");
   if (device)
@@ -4676,29 +4676,30 @@ gomp_register_page_locked (void *ptr, size_t size)
       else if (device->state == GOMP_DEVICE_FINALIZED)
 	{
 	  gomp_mutex_unlock (&device->lock);
-	  gomp_fatal ("Device %s for registering page-locked memory"
-		      " is finalized", device->name);
+	  gomp_fatal ("Device %s used for page-locked memory is finalized",
+		      device->name);
 	}
       gomp_mutex_unlock (&device->lock);
 
-      if (!device->register_page_locked_func (ptr, size))
-	gomp_fatal ("Failed to register page-locked memory"
+      if (!device->page_locked_host_alloc_func (ptr, size))
+	gomp_fatal ("Failed to allocate page-locked host memory"
 		    " via %s libgomp plugin",
 		    device->name);
     }
   return device != NULL;
 }
 
-/* Unregister page-locked memory region.
-   This must only be called if 'gomp_register_page_locked' returned 'true'.  */
+/* Free page-locked host memory.
+   This must only be called if 'gomp_page_locked_host_alloc' returned
+   'true'.  */
 
 attribute_hidden void
-gomp_unregister_page_locked (void *ptr, size_t size)
+gomp_page_locked_host_free (void *ptr)
 {
   gomp_debug (0, "%s: ptr=%p\n",
 	      __FUNCTION__, ptr);
 
-  struct gomp_device_descr *device = get_device_for_register_page_locked ();
+  struct gomp_device_descr *device = get_device_for_page_locked ();
   gomp_debug (0, "  device=%p (%s)\n",
 	      device, device ? device->name : "[none]");
   assert (device);
@@ -4712,8 +4713,8 @@ gomp_unregister_page_locked (void *ptr, size_t size)
     }
   gomp_mutex_unlock (&device->lock);
 
-  if (!device->unregister_page_locked_func (ptr, size))
-    gomp_fatal ("Failed to unregister page-locked memory"
+  if (!device->page_locked_host_free_func (ptr))
+    gomp_fatal ("Failed to free page-locked host memory"
 		" via %s libgomp plugin",
 		device->name);
 }
@@ -5403,8 +5404,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
   DLSYM_OPT (usm_alloc, usm_alloc);
   DLSYM_OPT (usm_free, usm_free);
   DLSYM_OPT (is_usm_ptr, is_usm_ptr);
-  DLSYM_OPT (register_page_locked, register_page_locked);
-  DLSYM_OPT (unregister_page_locked, unregister_page_locked);
+  DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
+  DLSYM_OPT (page_locked_host_free, page_locked_host_free);
   DLSYM (dev2host);
   DLSYM (host2dev);
   DLSYM (evaluate_device);
-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory
  2022-01-04 15:32 [PATCH] libgomp, openmp: pinned memory Andrew Stubbs
  2022-01-04 15:55 ` Jakub Jelinek
@ 2023-03-24 15:49 ` Thomas Schwinge
  2023-03-27  9:27   ` Stubbs, Andrew
  1 sibling, 1 reply; 28+ messages in thread
From: Thomas Schwinge @ 2023-03-24 15:49 UTC (permalink / raw)
  To: gcc-patches, Andrew Stubbs, Tobias Burnus

[-- Attachment #1: Type: text/plain, Size: 597 bytes --]

Hi!

On 2022-01-04T15:32:17+0000, Andrew Stubbs <ams@codesourcery.com> wrote:
> This patch implements the OpenMP pinned memory trait [...]

I figure it may be helpful to document the current og12 state of affairs;
does the attached "libgomp: Document OpenMP 'pinned' memory" look good to
you?


Grüße
 Thomas


-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-libgomp-Document-OpenMP-pinned-memory.patch --]
[-- Type: text/x-diff, Size: 1475 bytes --]

From 35ac1fb2d37f6c33a69f85ca8bac6f6a7bd7d837 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Fri, 24 Mar 2023 15:14:57 +0100
Subject: [PATCH] libgomp: Document OpenMP 'pinned' memory

	libgomp/
	* libgomp.texi (AMD Radeon, nvptx): Document OpenMP 'pinned'
	memory.
---
 libgomp/libgomp.texi | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 288e0b3a8ea..1cfae0cb8d1 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -4456,6 +4456,9 @@ The implementation remark:
 @item OpenMP code that has a requires directive with @code{unified_address} or
       @code{unified_shared_memory} will remove any GCN device from the list of
       available devices (``host fallback'').
+@item OpenMP @emph{pinned} memory (@code{omp_atk_pinned},
+      @code{ompx_pinned_mem_alloc}, for example) is allocated not via
+      the device, but via @code{mmap}, @code{mlock}.
 @end itemize
 
 
@@ -4518,6 +4521,10 @@ The implementation remark:
 @item OpenMP code that has a requires directive with @code{unified_address}
       or @code{unified_shared_memory} will remove any nvptx device from the
       list of available devices (``host fallback'').
+@item OpenMP @emph{pinned} memory (@code{omp_atk_pinned},
+      @code{ompx_pinned_mem_alloc}, for example) is allocated via the
+      device, thus helping lower-overhead host <-> device data
+      transfers.
 @end itemize
 
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory
  2023-03-24 15:49 ` [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory Thomas Schwinge
@ 2023-03-27  9:27   ` Stubbs, Andrew
  2023-03-27 11:26     ` [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory) Thomas Schwinge
  0 siblings, 1 reply; 28+ messages in thread
From: Stubbs, Andrew @ 2023-03-27  9:27 UTC (permalink / raw)
  To: Thomas Schwinge, gcc-patches, Andrew Stubbs, Tobias Burnus

> -----Original Message-----
> From: Thomas Schwinge <thomas@codesourcery.com>
> Sent: 24 March 2023 15:50
> To: gcc-patches@gcc.gnu.org; Andrew Stubbs <ams@codesourcery.com>;
> Tobias Burnus <tobias@codesourcery.com>
> Subject: [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH]
> libgomp, openmp: pinned memory
> 
> Hi!
> 
> On 2022-01-04T15:32:17+0000, Andrew Stubbs <ams@codesourcery.com>
> wrote:
> > This patch implements the OpenMP pinned memory trait [...]
> 
> I figure it may be helpful to document the current og12 state of affairs; does
> the attached "libgomp: Document OpenMP 'pinned' memory" look good to
> you?

I don't really know what "allocated via the device" means? I mean, I presume you mean "via CUDA", but I don't think this is obvious to the average reader.

Maybe "allocation is optimized for the device" or some such thing?

Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory)
  2023-03-27  9:27   ` Stubbs, Andrew
@ 2023-03-27 11:26     ` Thomas Schwinge
  2023-03-27 12:01       ` Andrew Stubbs
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Schwinge @ 2023-03-27 11:26 UTC (permalink / raw)
  To: gcc-patches, Andrew Stubbs, Tobias Burnus

[-- Attachment #1: Type: text/plain, Size: 1287 bytes --]

Hi!

On 2023-03-27T09:27:31+0000, "Stubbs, Andrew" <andrew.stubbs@siemens.com> wrote:
>> -----Original Message-----
>> From: Thomas Schwinge <thomas@codesourcery.com>
>> Sent: 24 March 2023 15:50
>>
>> On 2022-01-04T15:32:17+0000, Andrew Stubbs <ams@codesourcery.com>
>> wrote:
>> > This patch implements the OpenMP pinned memory trait [...]
>>
>> I figure it may be helpful to document the current og12 state of affairs; does
>> the attached "libgomp: Document OpenMP 'pinned' memory" look good to
>> you?
>
> I don't really know what "allocated via the device" means?

Heh, you're right.

> I mean, I presume you mean "via CUDA", but I don't think this is obvious to the average reader.
> Maybe "allocation is optimized for the device" or some such thing?

As we're in sections that are documenting GCN vs. nvptx specifics, we
might indeed call out which exact interfaces we're using.

How's the updated "libgomp: Document OpenMP 'pinned' memory", see
attached?


Grüße
 Thomas


-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-libgomp-Document-OpenMP-pinned-memory.patch --]
[-- Type: text/x-diff, Size: 1499 bytes --]

From 03e09ad4e0b4cd2232e8bb036dd2562b18ea2686 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Fri, 24 Mar 2023 15:14:57 +0100
Subject: [PATCH] libgomp: Document OpenMP 'pinned' memory

	libgomp/
	* libgomp.texi (AMD Radeon, nvptx): Document OpenMP 'pinned'
	memory.
---
 libgomp/libgomp.texi | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 288e0b3a8ea..6355ce2a37b 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -4456,6 +4456,9 @@ The implementation remark:
 @item OpenMP code that has a requires directive with @code{unified_address} or
       @code{unified_shared_memory} will remove any GCN device from the list of
       available devices (``host fallback'').
+@item OpenMP @emph{pinned} memory (@code{omp_atk_pinned},
+      @code{ompx_pinned_mem_alloc}, for example)
+      is allocated via @code{mmap}, @code{mlock}.
 @end itemize
 
 
@@ -4518,6 +4521,11 @@ The implementation remark:
 @item OpenMP code that has a requires directive with @code{unified_address}
       or @code{unified_shared_memory} will remove any nvptx device from the
       list of available devices (``host fallback'').
+@item OpenMP @emph{pinned} memory (@code{omp_atk_pinned},
+      @code{ompx_pinned_mem_alloc}, for example)
+      is allocated via @code{cuMemHostAlloc} (CUDA Driver API).
+      This potentially helps optimization of host <-> device data
+      transfers.
 @end itemize
 
 
-- 
2.25.1

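For reference, a minimal user-level sketch of the 'pinned' allocator trait
documented above, using only the standard OpenMP allocator API (hypothetical
example, not part of the patch):

  #include <omp.h>
  #include <stdlib.h>

  int
  main (void)
  {
    /* Request pinned (page-locked) memory via the allocator trait.  */
    omp_alloctrait_t traits[] = { { omp_atk_pinned, omp_atv_true } };
    omp_allocator_handle_t a
      = omp_init_allocator (omp_default_mem_space, 1, traits);
    if (a == omp_null_allocator)
      abort ();
    double *p = omp_alloc (1024 * sizeof *p, a);
    /* ... use p in host <-> device transfers ... */
    omp_free (p, a);
    omp_destroy_allocator (a);
    return 0;
  }
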

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory)
  2023-03-27 11:26     ` [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory) Thomas Schwinge
@ 2023-03-27 12:01       ` Andrew Stubbs
  0 siblings, 0 replies; 28+ messages in thread
From: Andrew Stubbs @ 2023-03-27 12:01 UTC (permalink / raw)
  To: Thomas Schwinge, gcc-patches, Tobias Burnus

On 27/03/2023 12:26, Thomas Schwinge wrote:
> Hi!
> 
> On 2023-03-27T09:27:31+0000, "Stubbs, Andrew" <andrew.stubbs@siemens.com> wrote:
>>> -----Original Message-----
>>> From: Thomas Schwinge <thomas@codesourcery.com>
>>> Sent: 24 March 2023 15:50
>>>
>>> On 2022-01-04T15:32:17+0000, Andrew Stubbs <ams@codesourcery.com>
>>> wrote:
>>>> This patch implements the OpenMP pinned memory trait [...]
>>>
>>> I figure it may be helpful to document the current og12 state of affairs; does
>>> the attached "libgomp: Document OpenMP 'pinned' memory" look good to
>>> you?
>>
>> I don't really know what "allocated via the device" means?
> 
> Heh, you're right.
> 
>> I mean, I presume you mean "via CUDA", but I don't think this is obvious to the average reader.
>> Maybe "allocation is optimized for the device" or some such thing?
> 
> As we're in sections that are documenting GCN vs. nvptx specifics, we
> might indeed call out which exact interfaces we're using.
> 
> How's the updated "libgomp: Document OpenMP 'pinned' memory", see
> attached?

LGTM, FWIW.

Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2023-03-27 12:01 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-04 15:32 [PATCH] libgomp, openmp: pinned memory Andrew Stubbs
2022-01-04 15:55 ` Jakub Jelinek
2022-01-04 16:58   ` Andrew Stubbs
2022-01-04 18:28     ` Jakub Jelinek
2022-01-04 18:47       ` Jakub Jelinek
2022-01-05 17:07         ` Andrew Stubbs
2022-01-13 13:53           ` Andrew Stubbs
2022-06-07 11:05             ` Andrew Stubbs
2022-06-07 12:10               ` Jakub Jelinek
2022-06-07 12:28                 ` Andrew Stubbs
2022-06-07 12:40                   ` Jakub Jelinek
2022-06-09  9:38                   ` Thomas Schwinge
2022-06-09 10:09                     ` Tobias Burnus
2022-06-09 10:22                       ` Stubbs, Andrew
2022-06-09 10:31                     ` Stubbs, Andrew
2023-02-16 15:32                     ` Attempt to register OpenMP pinned memory using a device instead of 'mlock' (was: [PATCH] libgomp, openmp: pinned memory) Thomas Schwinge
2023-02-16 16:17                       ` Stubbs, Andrew
2023-02-16 22:06                         ` [og12] " Thomas Schwinge
2023-02-17  8:12                           ` Thomas Schwinge
2023-02-20  9:48                             ` Andrew Stubbs
2023-02-20 13:53                               ` [og12] Attempt to not just register but allocate OpenMP pinned memory using a device (was: [og12] Attempt to register OpenMP pinned memory using a device instead of 'mlock') Thomas Schwinge
2023-02-10 15:11             ` [PATCH] libgomp, openmp: pinned memory Thomas Schwinge
2023-02-10 15:55               ` Andrew Stubbs
2023-02-16 21:39             ` [og12] Clarify/verify OpenMP 'omp_calloc' zero-initialization for pinned memory (was: [PATCH] libgomp, openmp: pinned memory) Thomas Schwinge
2023-03-24 15:49 ` [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory Thomas Schwinge
2023-03-27  9:27   ` Stubbs, Andrew
2023-03-27 11:26     ` [og12] libgomp: Document OpenMP 'pinned' memory (was: [PATCH] libgomp, openmp: pinned memory) Thomas Schwinge
2023-03-27 12:01       ` Andrew Stubbs

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).