[gcc/devel/omp/gcc-12] nvptx: reimplement libgomp barriers [PR99555]

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

* [gcc/devel/omp/gcc-12] nvptx: reimplement libgomp barriers [PR99555]
@ 2022-12-21 18:21 Tobias Burnus
  0 siblings, 0 replies; only message in thread
From: Tobias Burnus @ 2022-12-21 18:21 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:881fc99968a8e43290c93500e4df5532b7b7563c

commit 881fc99968a8e43290c93500e4df5532b7b7563c
Author: Chung-Lin Tang <cltang@codesourcery.com>
Date:   Wed Dec 21 18:56:41 2022 +0100

    nvptx: reimplement libgomp barriers [PR99555]
    
    Instead of trying to have the GPU do CPU-with-OS-like things, this new barriers
    implementation for NVPTX uses simplistic bar.* synchronization instructions.
    Tasks are processed after threads have joined, and only if team->task_count != 0.
    
    Note that there might be a little bit of performance forfeited for
    cases where earlier arriving threads could've been used to process tasks ahead
    of other threads, but that would require implementing complex
    futex-wait/wake like behavior, which is what we're trying to avoid with this patch.
    It is deemed that task processing is not what GPU target offloading is usually
    used for.
    
    Implementation highlight notes:
    1. gomp_team_barrier_wake() is now an empty function (threads never "wake" in
       the usual manner)
    2. gomp_team_barrier_cancel() now uses the "exit" PTX instruction.
    3. gomp_barrier_wait_last() is now implemented using "bar.arrive"
    
    4. gomp_team_barrier_wait_end()/gomp_team_barrier_wait_cancel_end():
       The main synchronization is done using a 'bar.red' instruction. This reduces
       across all threads the condition (team->task_count != 0), to enable the task
       processing down below if any thread created a task.
       (this bar.red usage means that this patch is dependent on the prior NVPTX
       bar.red GCC patch)
    
            PR target/99555
    
    libgomp/ChangeLog:
    
            * config/nvptx/bar.c (generation_to_barrier): Remove.
            (futex_wait,futex_wake,do_spin,do_wait): Remove.
            (GOMP_WAIT_H): Remove.
            (#include "../linux/bar.c"): Remove.
            (gomp_barrier_wait_end): New function.
            (gomp_barrier_wait): Likewise.
            (gomp_barrier_wait_last): Likewise.
            (gomp_team_barrier_wait_end): Likewise.
            (gomp_team_barrier_wait): Likewise.
            (gomp_team_barrier_wait_final): Likewise.
            (gomp_team_barrier_wait_cancel_end): Likewise.
            (gomp_team_barrier_wait_cancel): Likewise.
            (gomp_team_barrier_cancel): Likewise.
            * config/nvptx/bar.h (gomp_barrier_t): Remove waiters, lock fields.
            (gomp_barrier_init): Remove init of waiters, lock fields.
            (gomp_team_barrier_wake): Remove prototype, add new static inline
            function.
    
    (cherry picked from commit fdc7469cf597ec11229ddfc3e9c7a06f3d0fba9d)

Diff:
---
 libgomp/ChangeLog.omp      |  23 +++++
 libgomp/config/nvptx/bar.c | 215 ++++++++++++++++++++++++---------------------
 libgomp/config/nvptx/bar.h |  12 +--
 3 files changed, 147 insertions(+), 103 deletions(-)

diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index 068f4810100..01bb52e7b5a 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,3 +1,26 @@
+2022-12-21  Tobias Burnus  <tobias@codesourcery.com>
+
+	Backported from master:
+	2022-12-21  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* config/nvptx/bar.c (generation_to_barrier): Remove.
+	(futex_wait,futex_wake,do_spin,do_wait): Remove.
+	(GOMP_WAIT_H): Remove.
+	(#include "../linux/bar.c"): Remove.
+	(gomp_barrier_wait_end): New function.
+	(gomp_barrier_wait): Likewise.
+	(gomp_barrier_wait_last): Likewise.
+	(gomp_team_barrier_wait_end): Likewise.
+	(gomp_team_barrier_wait): Likewise.
+	(gomp_team_barrier_wait_final): Likewise.
+	(gomp_team_barrier_wait_cancel_end): Likewise.
+	(gomp_team_barrier_wait_cancel): Likewise.
+	(gomp_team_barrier_cancel): Likewise.
+	* config/nvptx/bar.h (gomp_barrier_t): Remove waiters, lock fields.
+	(gomp_barrier_init): Remove init of waiters, lock fields.
+	(gomp_team_barrier_wake): Remove prototype, add new static inline
+	function.
+
 2022-12-16  Andrew Stubbs  <ams@codesourcery.com>
 
 	* libgomp.h (OFFSET_USM): New macro.
diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index eee21071f47..2c9f96d8ddf 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -30,137 +30,156 @@
 #include <limits.h>
 #include "libgomp.h"
 
-/* For cpu_relax.  */
-#include "doacross.h"
-
-/* Assuming ADDR is &bar->generation, return bar.  Copied from
-   rtems/bar.c.  */
+void
+gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
+			MEMMODEL_RELEASE);
+    }
+  if (bar->total > 1)
+    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+}
 
-static gomp_barrier_t *
-generation_to_barrier (int *addr)
+void
+gomp_barrier_wait (gomp_barrier_t *bar)
 {
-  char *bar
-    = (char *) addr - __builtin_offsetof (gomp_barrier_t, generation);
-  return (gomp_barrier_t *)bar;
+  gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
 }
 
-/* Implement futex_wait-like behaviour to plug into the linux/bar.c
-   implementation.  Assumes ADDR is &bar->generation.   */
+/* Like gomp_barrier_wait, except that if the encountering thread
+   is not the last one to hit the barrier, it returns immediately.
+   The intended usage is that a thread which intends to gomp_barrier_destroy
+   this barrier calls gomp_barrier_wait, while all other threads
+   call gomp_barrier_wait_last.  When gomp_barrier_wait returns,
+   the barrier can be safely destroyed.  */
 
-static inline void
-futex_wait (int *addr, int val)
+void
+gomp_barrier_wait_last (gomp_barrier_t *bar)
 {
-  gomp_barrier_t *bar = generation_to_barrier (addr);
+  /* The above described behavior matches 'bar.arrive' perfectly.  */
+  if (bar->total > 1)
+    asm ("bar.arrive 1, %0;" : : "r" (32 * bar->total));
+}
 
-  if (bar->total < 2)
-    /* A barrier with less than two threads, nop.  */
-    return;
+/* Barriers are implemented mainly using 'bar.red.or', which combines a bar.sync
+   operation with an OR-reduction of "team->task_count != 0" across all threads.
+   Task processing is done only after synchronization and verifying that
+   task_count was non-zero in at least one of the team threads.
 
-  gomp_mutex_lock (&bar->lock);
+   This use of simple-barriers, and queueing of tasks till the end, is deemed
+   more efficient performance-wise for GPUs in the common offloading case, as
+   opposed to implementing futex-wait/wake operations to simultaneously process
+   tasks in a CPU-thread manner (which is not easy to implement efficiently
+   on GPUs).  */
 
-  /* Futex semantics: only go to sleep if *addr == val.  */
-  if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_ACQUIRE) != val, 0))
-    {
-      gomp_mutex_unlock (&bar->lock);
-      return;
-    }
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
 
-  /* Register as waiter.  */
-  unsigned int waiters
-    = __atomic_add_fetch (&bar->waiters, 1, MEMMODEL_ACQ_REL);
-  if (waiters == 0)
-    __builtin_abort ();
-  unsigned int waiter_id = waiters;
+  bool run_tasks = (team->task_count != 0);
+  if (bar->total > 1)
+    run_tasks = __builtin_nvptx_bar_red_or (1, 32 * bar->total, true,
+					    (team->task_count != 0));
 
-  if (waiters > 1)
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
-      /* Wake other threads in bar.sync.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * waiters));
-
-      /* Ensure that they have updated waiters.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * waiters));
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
     }
 
-  gomp_mutex_unlock (&bar->lock);
-
-  while (1)
+  if (__builtin_expect (run_tasks == true, 0))
     {
-      /* Wait for next thread in barrier.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
+      while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE)
+	     & BAR_TASK_PENDING)
+	gomp_barrier_handle_tasks (state);
 
-      /* Get updated waiters.  */
-      unsigned int updated_waiters
-	= __atomic_load_n (&bar->waiters, MEMMODEL_ACQUIRE);
-
-      /* Notify that we have updated waiters.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
-
-      waiters = updated_waiters;
+      if (bar->total > 1)
+	asm volatile ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+    }
+}
 
-      if (waiter_id > waiters)
-	/* A wake happened, and we're in the group of woken threads.  */
-	break;
+void
+gomp_team_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
 
-      /* Continue waiting.  */
-    }
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    bar->awaited_final = bar->total;
+  gomp_team_barrier_wait_end (bar, state);
 }
 
-/* Implement futex_wake-like behaviour to plug into the linux/bar.c
-   implementation.  Assumes ADDR is &bar->generation.  */
+/* See also comments for gomp_team_barrier_wait_end.  */
 
-static inline void
-futex_wake (int *addr, int count)
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+				   gomp_barrier_state_t state)
 {
-  gomp_barrier_t *bar = generation_to_barrier (addr);
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
 
-  if (bar->total < 2)
-    /* A barrier with less than two threads, nop.  */
-    return;
+  bool run_tasks = (team->task_count != 0);
+  if (bar->total > 1)
+    run_tasks = __builtin_nvptx_bar_red_or (1, 32 * bar->total, true,
+					    (team->task_count != 0));
+  if (state & BAR_CANCELLED)
+    return true;
 
-  gomp_mutex_lock (&bar->lock);
-  unsigned int waiters = __atomic_load_n (&bar->waiters, MEMMODEL_ACQUIRE);
-  if (waiters == 0)
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
     {
-      /* No threads to wake.  */
-      gomp_mutex_unlock (&bar->lock);
-      return;
+      /* Note: BAR_CANCELLED should never be set in state here, because
+	 cancellation means that at least one of the threads has been
+	 cancelled, thus on a cancellable barrier we should never see
+	 all threads to arrive.  */
+
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
     }
 
-  if (count == INT_MAX)
-    /* Release all threads.  */
-    __atomic_store_n (&bar->waiters, 0, MEMMODEL_RELEASE);
-  else if (count < bar->total)
-    /* Release count threads.  */
-    __atomic_add_fetch (&bar->waiters, -count, MEMMODEL_ACQ_REL);
-  else
-    /* Count has an illegal value.  */
-    __builtin_abort ();
-
-  /* Wake other threads in bar.sync.  */
-  asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
+  if (__builtin_expect (run_tasks == true, 0))
+    {
+      while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE)
+	     & BAR_TASK_PENDING)
+	gomp_barrier_handle_tasks (state);
 
-  /* Let them get the updated waiters.  */
-  asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
+      if (bar->total > 1)
+	asm volatile ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+    }
 
-  gomp_mutex_unlock (&bar->lock);
+  return false;
 }
 
-/* Copied from linux/wait.h.  */
-
-static inline int do_spin (int *addr, int val)
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
 {
-  /* The current implementation doesn't spin.  */
-  return 1;
+  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
 }
 
-/* Copied from linux/wait.h.  */
-
-static inline void do_wait (int *addr, int val)
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
 {
-  if (do_spin (addr, val))
-    futex_wait (addr, val);
-}
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
 
-/* Reuse the linux implementation.  */
-#define GOMP_WAIT_H 1
-#include "../linux/bar.c"
+  /* The 'exit' instruction cancels this thread and also fulfills any barriers
+     that other CTA threads are waiting on.  */
+  asm volatile ("exit;");
+}
diff --git a/libgomp/config/nvptx/bar.h b/libgomp/config/nvptx/bar.h
index 28bf7f4d313..61e9b55ceb1 100644
--- a/libgomp/config/nvptx/bar.h
+++ b/libgomp/config/nvptx/bar.h
@@ -38,8 +38,6 @@ typedef struct
   unsigned generation;
   unsigned awaited;
   unsigned awaited_final;
-  unsigned waiters;
-  gomp_mutex_t lock;
 } gomp_barrier_t;
 
 typedef unsigned int gomp_barrier_state_t;
@@ -59,8 +57,6 @@ static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
   bar->awaited = count;
   bar->awaited_final = count;
   bar->generation = 0;
-  bar->waiters = 0;
-  gomp_mutex_init (&bar->lock);
 }
 
 static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
@@ -83,10 +79,16 @@ extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
 extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
 extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
 					       gomp_barrier_state_t);
-extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
 struct gomp_team;
 extern void gomp_team_barrier_cancel (struct gomp_team *);
 
+static inline void
+gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
+{
+  /* We never "wake up" threads on nvptx.  Threads wait at barrier
+     instructions till the barrier is fulfilled.  Do nothing here.  */
+}
+
 static inline gomp_barrier_state_t
 gomp_barrier_wait_start (gomp_barrier_t *bar)
 {

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-12-21 18:21 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-21 18:21 [gcc/devel/omp/gcc-12] nvptx: reimplement libgomp barriers [PR99555] Tobias Burnus

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).