public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-6225] openmp, nvptx: low-lat memory access traits
@ 2023-12-06 16:59 Andrew Stubbs
  0 siblings, 0 replies; only message in thread
From: Andrew Stubbs @ 2023-12-06 16:59 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:e9a19ead498fcc89186b724c6e76854f7751a89b

commit r14-6225-ge9a19ead498fcc89186b724c6e76854f7751a89b
Author: Andrew Stubbs <ams@codesourcery.com>
Date:   Thu Jan 27 13:48:50 2022 +0000

    openmp, nvptx: low-lat memory access traits
    
    The NVPTX low latency memory is not accessible outside the team that allocates
    it, and therefore should be unavailable for allocators with the access trait
    "all".  This change means that the omp_low_lat_mem_alloc predefined
    allocator no longer works (but omp_cgroup_mem_alloc still does).
    
    libgomp/ChangeLog:
    
            * allocator.c (MEMSPACE_VALIDATE): New macro.
            (omp_init_allocator): Use MEMSPACE_VALIDATE.
            (omp_aligned_alloc): Use OMP_LOW_LAT_MEM_ALLOC_INVALID.
            (omp_aligned_calloc): Likewise.
            (omp_realloc): Likewise.
            * config/nvptx/allocator.c (nvptx_memspace_validate): New function.
            (MEMSPACE_VALIDATE): New macro.
            (OMP_LOW_LAT_MEM_ALLOC_INVALID): New define.
            * libgomp.texi: Document low-latency implementation details.
            * testsuite/libgomp.c/omp_alloc-1.c (main): Add gnu_lowlat.
            * testsuite/libgomp.c/omp_alloc-2.c (main): Add gnu_lowlat.
            * testsuite/libgomp.c/omp_alloc-3.c (main): Add gnu_lowlat.
            * testsuite/libgomp.c/omp_alloc-4.c (main): Add access trait.
            * testsuite/libgomp.c/omp_alloc-5.c (main): Add gnu_lowlat.
            * testsuite/libgomp.c/omp_alloc-6.c (main): Add access trait.
            * testsuite/libgomp.c/omp_alloc-traits.c: New test.

Diff:
---
 libgomp/allocator.c                            | 20 ++++++++
 libgomp/config/nvptx/allocator.c               | 21 ++++++++
 libgomp/libgomp.texi                           | 18 +++++++
 libgomp/testsuite/libgomp.c/omp_alloc-1.c      | 10 ++++
 libgomp/testsuite/libgomp.c/omp_alloc-2.c      |  8 ++++
 libgomp/testsuite/libgomp.c/omp_alloc-3.c      |  7 +++
 libgomp/testsuite/libgomp.c/omp_alloc-4.c      |  7 +--
 libgomp/testsuite/libgomp.c/omp_alloc-5.c      |  8 ++++
 libgomp/testsuite/libgomp.c/omp_alloc-6.c      |  7 +--
 libgomp/testsuite/libgomp.c/omp_alloc-traits.c | 66 ++++++++++++++++++++++++++
 10 files changed, 166 insertions(+), 6 deletions(-)

diff --git a/libgomp/allocator.c b/libgomp/allocator.c
index fa398128368..a8a80f8028d 100644
--- a/libgomp/allocator.c
+++ b/libgomp/allocator.c
@@ -56,6 +56,10 @@
 #define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
   free (((void)(MEMSPACE), (void)(SIZE), (ADDR)))
 #endif
+#ifndef MEMSPACE_VALIDATE
+#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \
+  (((void)(MEMSPACE), (void)(ACCESS), 1))
+#endif
 
 /* Map the predefined allocators to the correct memory space.
    The index to this table is the omp_allocator_handle_t enum value.
@@ -439,6 +443,10 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
   if (data.pinned)
     return omp_null_allocator;
 
+  /* Reject unsupported memory spaces.  */
+  if (!MEMSPACE_VALIDATE (data.memspace, data.access))
+    return omp_null_allocator;
+
   ret = gomp_malloc (sizeof (struct omp_allocator_data));
   *ret = data;
 #ifndef HAVE_SYNC_BUILTINS
@@ -522,6 +530,10 @@ retry:
     new_size += new_alignment - sizeof (void *);
   if (__builtin_add_overflow (size, new_size, &new_size))
     goto fail;
+#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID
+  if (allocator == omp_low_lat_mem_alloc)
+    goto fail;
+#endif
 
   if (__builtin_expect (allocator_data
 			&& allocator_data->pool_size < ~(uintptr_t) 0, 0))
@@ -820,6 +832,10 @@ retry:
     goto fail;
   if (__builtin_add_overflow (size_temp, new_size, &new_size))
     goto fail;
+#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID
+  if (allocator == omp_low_lat_mem_alloc)
+    goto fail;
+#endif
 
   if (__builtin_expect (allocator_data
 			&& allocator_data->pool_size < ~(uintptr_t) 0, 0))
@@ -1054,6 +1070,10 @@ retry:
   if (__builtin_add_overflow (size, new_size, &new_size))
     goto fail;
   old_size = data->size;
+#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID
+  if (allocator == omp_low_lat_mem_alloc)
+    goto fail;
+#endif
 
   if (__builtin_expect (allocator_data
 			&& allocator_data->pool_size < ~(uintptr_t) 0, 0))
diff --git a/libgomp/config/nvptx/allocator.c b/libgomp/config/nvptx/allocator.c
index 6014fba177f..a3302411bcb 100644
--- a/libgomp/config/nvptx/allocator.c
+++ b/libgomp/config/nvptx/allocator.c
@@ -108,6 +108,21 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
     return realloc (addr, size);
 }
 
+static inline int
+nvptx_memspace_validate (omp_memspace_handle_t memspace, unsigned access)
+{
+#if __PTX_ISA_VERSION_MAJOR__ > 4 \
+    || (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR >= 1)
+  /* Disallow use of low-latency memory when it must be accessible by
+     all threads.  */
+  return (memspace != omp_low_lat_mem_space
+	  || access != omp_atv_all);
+#else
+  /* Low-latency memory is not available before PTX 4.1.  */
+  return (memspace != omp_low_lat_mem_space);
+#endif
+}
+
 #define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
   nvptx_memspace_alloc (MEMSPACE, SIZE)
 #define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
@@ -116,5 +131,11 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
   nvptx_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
 #define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
   nvptx_memspace_free (MEMSPACE, ADDR, SIZE)
+#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \
+  nvptx_memspace_validate (MEMSPACE, ACCESS)
+
+/* The default low-latency memspace implies omp_atv_all, which is incompatible
+   with the .shared memory space.  */
+#define OMP_LOW_LAT_MEM_ALLOC_INVALID 1
 
 #include "../../allocator.c"
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 54c4bc26584..8d57c17c450 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -5767,6 +5767,9 @@ Additional notes regarding the traits:
 @item The @code{sync_hint} trait has no effect.
 @end itemize
 
+See also:
+@ref{Offload-Target Specifics}
+
 @c ---------------------------------------------------------------------
 @c Offload-Target Specifics
 @c ---------------------------------------------------------------------
@@ -5900,6 +5903,21 @@ The implementation remark:
       directive for non-contiguous list items will use the 2D and 3D
       memory-copy functions of the CUDA library.  Higher dimensions will
       call those functions in a loop and are therefore supported.
+@item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when the
+      the @code{access} trait is set to @code{cgroup}, the ISA is at least
+      @code{sm_53}, and the PTX version is at least 4.1.  The default pool size
+      is 8 kiB per team, but may be adjusted at runtime by setting environment
+      variable @code{GOMP_NVPTX_LOWLAT_POOL=@var{bytes}}.  The maximum value is
+      limited by the available hardware, and care should be taken that the
+      selected pool size does not unduly limit the number of teams that can
+      run simultaneously.
+@item @code{omp_low_lat_mem_alloc} cannot be used with true low-latency memory
+      because the definition implies the @code{omp_atv_all} trait; main
+      graphics memory is used instead.
+@item @code{omp_cgroup_mem_alloc}, @code{omp_pteam_mem_alloc}, and
+      @code{omp_thread_mem_alloc}, all use low-latency memory as first
+      preference, and fall back to main graphics memory when the low-latency
+      pool is exhausted.
 @end itemize
 
 
diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-1.c b/libgomp/testsuite/libgomp.c/omp_alloc-1.c
index f4e594f1e98..7f7f440c12c 100644
--- a/libgomp/testsuite/libgomp.c/omp_alloc-1.c
+++ b/libgomp/testsuite/libgomp.c/omp_alloc-1.c
@@ -32,12 +32,21 @@ test (int n, omp_allocator_handle_t allocator)
 int
 main ()
 {
+  /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU.  */
+  omp_allocator_handle_t gpu_lowlat = 0;
+  #pragma omp target map(from:gpu_lowlat)
+    {
+      omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
+      gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
+    }
+
   // Smaller than low-latency memory limit
   test (10, omp_default_mem_alloc);
   test (10, omp_large_cap_mem_alloc);
   test (10, omp_const_mem_alloc);
   test (10, omp_high_bw_mem_alloc);
   test (10, omp_low_lat_mem_alloc);
+  test (10, gpu_lowlat);
   test (10, omp_cgroup_mem_alloc);
   test (10, omp_pteam_mem_alloc);
   test (10, omp_thread_mem_alloc);
@@ -48,6 +57,7 @@ main ()
   test (100000, omp_const_mem_alloc);
   test (100000, omp_high_bw_mem_alloc);
   test (100000, omp_low_lat_mem_alloc);
+  test (100000, gpu_lowlat);
   test (100000, omp_cgroup_mem_alloc);
   test (100000, omp_pteam_mem_alloc);
   test (100000, omp_thread_mem_alloc);
diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-2.c b/libgomp/testsuite/libgomp.c/omp_alloc-2.c
index e9fd1602946..54523f1061e 100644
--- a/libgomp/testsuite/libgomp.c/omp_alloc-2.c
+++ b/libgomp/testsuite/libgomp.c/omp_alloc-2.c
@@ -40,12 +40,19 @@ test (int n, omp_allocator_handle_t allocator)
 int
 main ()
 {
+  /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU.  */
+  omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
+  omp_allocator_handle_t gpu_lowlat;
+  #pragma omp target map(from:gpu_lowlat)
+  gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
+
   // Smaller than low-latency memory limit
   test (10, omp_default_mem_alloc);
   test (10, omp_large_cap_mem_alloc);
   test (10, omp_const_mem_alloc);
   test (10, omp_high_bw_mem_alloc);
   test (10, omp_low_lat_mem_alloc);
+  test (10, gpu_lowlat);
   test (10, omp_cgroup_mem_alloc);
   test (10, omp_pteam_mem_alloc);
   test (10, omp_thread_mem_alloc);
@@ -56,6 +63,7 @@ main ()
   test (1000, omp_const_mem_alloc);
   test (1000, omp_high_bw_mem_alloc);
   test (1000, omp_low_lat_mem_alloc);
+  test (1000, gpu_lowlat);
   test (1000, omp_cgroup_mem_alloc);
   test (1000, omp_pteam_mem_alloc);
   test (1000, omp_thread_mem_alloc);
diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-3.c b/libgomp/testsuite/libgomp.c/omp_alloc-3.c
index 792e2200f30..682d149d379 100644
--- a/libgomp/testsuite/libgomp.c/omp_alloc-3.c
+++ b/libgomp/testsuite/libgomp.c/omp_alloc-3.c
@@ -28,12 +28,19 @@ test (omp_allocator_handle_t allocator)
 int
 main ()
 {
+  /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU.  */
+  omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
+  omp_allocator_handle_t gpu_lowlat;
+  #pragma omp target map(from:gpu_lowlat)
+  gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
+
   // Smaller than low-latency memory limit
   test (omp_default_mem_alloc);
   test (omp_large_cap_mem_alloc);
   test (omp_const_mem_alloc);
   test (omp_high_bw_mem_alloc);
   test (omp_low_lat_mem_alloc);
+  test (gpu_lowlat);
   test (omp_cgroup_mem_alloc);
   test (omp_pteam_mem_alloc);
   test (omp_thread_mem_alloc);
diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-4.c b/libgomp/testsuite/libgomp.c/omp_alloc-4.c
index c7d0c46c6b3..03841404daa 100644
--- a/libgomp/testsuite/libgomp.c/omp_alloc-4.c
+++ b/libgomp/testsuite/libgomp.c/omp_alloc-4.c
@@ -26,10 +26,11 @@ main ()
   #pragma omp target
   {
     /* Ensure that the memory we get *is* low-latency with a null-fallback.  */
-    omp_alloctrait_t traits[1]
-      = { { omp_atk_fallback, omp_atv_null_fb } };
+    omp_alloctrait_t traits[2]
+      = { { omp_atk_fallback, omp_atv_null_fb },
+          { omp_atk_access, omp_atv_cgroup } };
     omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
-							1, traits);
+							2, traits);
 
     int size = 4;
 
diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-5.c b/libgomp/testsuite/libgomp.c/omp_alloc-5.c
index 10805ded6d0..26bf38c1ca6 100644
--- a/libgomp/testsuite/libgomp.c/omp_alloc-5.c
+++ b/libgomp/testsuite/libgomp.c/omp_alloc-5.c
@@ -39,12 +39,19 @@ test (int n, omp_allocator_handle_t allocator)
 int
 main ()
 {
+  /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU.  */
+  omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
+  omp_allocator_handle_t gpu_lowlat;
+  #pragma omp target map(from:gpu_lowlat)
+  gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
+
   // Smaller than low-latency memory limit
   test (10, omp_default_mem_alloc);
   test (10, omp_large_cap_mem_alloc);
   test (10, omp_const_mem_alloc);
   test (10, omp_high_bw_mem_alloc);
   test (10, omp_low_lat_mem_alloc);
+  test (10, gpu_lowlat);
   test (10, omp_cgroup_mem_alloc);
   test (10, omp_pteam_mem_alloc);
   test (10, omp_thread_mem_alloc);
@@ -55,6 +62,7 @@ main ()
   test (100000, omp_const_mem_alloc);
   test (100000, omp_high_bw_mem_alloc);
   test (100000, omp_low_lat_mem_alloc);
+  test (100000, gpu_lowlat);
   test (100000, omp_cgroup_mem_alloc);
   test (100000, omp_pteam_mem_alloc);
   test (100000, omp_thread_mem_alloc);
diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-6.c b/libgomp/testsuite/libgomp.c/omp_alloc-6.c
index b326cad9233..13e8747dc3b 100644
--- a/libgomp/testsuite/libgomp.c/omp_alloc-6.c
+++ b/libgomp/testsuite/libgomp.c/omp_alloc-6.c
@@ -26,10 +26,11 @@ main ()
   #pragma omp target
   {
     /* Ensure that the memory we get *is* low-latency with a null-fallback.  */
-    omp_alloctrait_t traits[1]
-      = { { omp_atk_fallback, omp_atv_null_fb } };
+    omp_alloctrait_t traits[2]
+      = { { omp_atk_fallback, omp_atv_null_fb },
+          { omp_atk_access, omp_atv_cgroup } };
     omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
-							1, traits);
+							2, traits);
 
     int size = 16;
 
diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-traits.c b/libgomp/testsuite/libgomp.c/omp_alloc-traits.c
new file mode 100644
index 00000000000..4ff0fca4986
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/omp_alloc-traits.c
@@ -0,0 +1,66 @@
+/* { dg-do run } */
+
+/* { dg-require-effective-target offload_device } */
+/* { dg-xfail-if "not implemented" { ! offload_target_nvptx } } */
+
+/* Test that GPU low-latency allocation is limited to team access.  */
+
+#include <stddef.h>
+#include <omp.h>
+
+#pragma omp requires dynamic_allocators
+
+int
+main ()
+{
+  #pragma omp target
+  {
+    /* Ensure that the memory we get *is* low-latency with a null-fallback.  */
+    omp_alloctrait_t traits[2]
+      = { { omp_atk_fallback, omp_atv_null_fb },
+	  { omp_atk_access, omp_atv_cgroup } };
+    omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
+							2, traits); // good
+
+    omp_alloctrait_t traits_all[2]
+      = { { omp_atk_fallback, omp_atv_null_fb },
+	  { omp_atk_access, omp_atv_all } };
+    omp_allocator_handle_t lowlat_all
+      = omp_init_allocator (omp_low_lat_mem_space, 2, traits_all); // bad
+
+    omp_alloctrait_t traits_default[1]
+      = { { omp_atk_fallback, omp_atv_null_fb } };
+    omp_allocator_handle_t lowlat_default
+      = omp_init_allocator (omp_low_lat_mem_space, 1, traits_default); // bad
+
+    if (lowlat_all != omp_null_allocator
+	|| lowlat_default != omp_null_allocator)
+      __builtin_abort ();
+
+    void *a = omp_alloc (1, lowlat); // good
+
+    if (!a)
+      __builtin_abort ();
+
+    omp_free (a, lowlat);
+
+
+    a = omp_calloc (1, 1, lowlat); // good
+
+    if (!a)
+      __builtin_abort ();
+
+    omp_free (a, lowlat);
+
+
+    a = omp_realloc (NULL, 1, lowlat, lowlat); // good
+
+    if (!a)
+      __builtin_abort ();
+
+    omp_free (a, lowlat);
+  }
+
+  return 0;
+}
+

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-12-06 16:59 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-06 16:59 [gcc r14-6225] openmp, nvptx: low-lat memory access traits Andrew Stubbs

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).