public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound
@ 2021-11-12 13:20 Jakub Jelinek
  2021-11-12 13:27 ` [PATCH] libgomp, nvptx, v2: " Jakub Jelinek
  0 siblings, 1 reply; 8+ messages in thread
From: Jakub Jelinek @ 2021-11-12 13:20 UTC (permalink / raw)
  To: Tobias Burnus, Andrew Stubbs, Tom de Vries; +Cc: gcc-patches

Hi!

Here is an completely untested attempt at implementing what I was talking
about, that for num_teams_upper 0 or whenever num_teams_lower <= num_blocks,
the current implementation is fine but if the user explicitly asks for more
teams than we can provide in hardware, we need to stop assuming that
omp_get_team_num () is equal to the hw team id, but instead need to use some
team specific memory (I believe it is .shared for PTX), or if none is
provided, array indexed by the hw team id and run some teams serially within
the same hw thread.

This patch assumes that .shared variables are initialized to 0,
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html lists
in Table 7. .shared as non-initializable.  If that isn't the case,
we need to initialize it somewhere for the case of #pragma omp target
without #pragma omp teams in it, maybe in libgcc/config/nvptx/crt0.c ?

2021-11-12  Jakub Jelinek  <jakub@redhat.com>

	* config/nvptx/target.c (__gomp_team_num): Define using inline asm as
	a .shared var.
	(GOMP_teams4): Use __gomp_team_num as the team number instead of
	%ctaid.x.  If first, initialize it to %ctaid.x.  If num_teams_lower
	is bigger than num_blocks, use num_teams_lower teams and arrange for
	bumping of __gomp_team_num if !first and returning false once we run
	out of teams.
	* config/nvptx/teams.c (__gomp_team_num): Declare using inline asm as
	an external .shared var.
	(omp_get_team_num): Return __gomp_team_num value instead of %ctaid.x.

--- libgomp/config/nvptx/target.c.jj	2021-11-12 12:41:11.433501988 +0100
+++ libgomp/config/nvptx/target.c	2021-11-12 14:02:56.231477929 +0100
@@ -26,28 +26,43 @@
 #include "libgomp.h"
 #include <limits.h>
 
+asm ("\n// BEGIN GLOBAL VAR DECL: __gomp_team_num"
+     "\n.visible .shared .align 4 .u32 __gomp_team_num[1];");
+
 bool
 GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
 	     unsigned int thread_limit, bool first)
 {
+  unsigned int num_blocks, block_id;
+  asm ("mov.u32 %0, %%nctaid.x;" : "=r" (num_blocks));
   if (!first)
-    return false;
+    {
+      unsigned in team_num;
+      if (num_blocks > gomp_num_teams_var)
+	return false;
+      asm ("ld.shared.u32\t%0, [__gomp_team_num]" : "=r" (team_num));
+      if (team_num > gomp_num_teams_var - num_blocks)
+	return false;
+      asm ("st.shared.u32\t[__gomp_team_num], %0"
+	   : : "r" (team_num + num_blocks));
+      return true;
+    }
   if (thread_limit)
     {
       struct gomp_task_icv *icv = gomp_icv (true);
       icv->thread_limit_var
 	= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
     }
-  unsigned int num_blocks, block_id;
-  asm ("mov.u32 %0, %%nctaid.x;" : "=r" (num_blocks));
-  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id));
-  /* FIXME: If num_teams_lower > num_blocks, we want to loop multiple
-     times for some CTAs.  */
-  (void) num_teams_lower;
-  if (!num_teams_upper || num_teams_upper >= num_blocks)
+  if (!num_teams_upper)
     num_teams_upper = num_blocks;
-  else if (block_id >= num_teams_upper)
+  else if (num_blocks < num_teams_lower)
+    num_teams_upper = num_teams_lower;
+  else if (num_blocks < num_teams_upper)
+    num_teams_upper = num_blocks;
+  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id));
+  if (block_id >= num_teams_upper)
     return false;
+  asm ("st.shared.u32\t[__gomp_team_num], %0" : : "r" (block_id));
   gomp_num_teams_var = num_teams_upper - 1;
   return true;
 }
--- libgomp/config/nvptx/teams.c.jj	2021-01-05 00:13:58.255297642 +0100
+++ libgomp/config/nvptx/teams.c	2021-11-12 13:55:59.950421993 +0100
@@ -28,6 +28,9 @@
 
 #include "libgomp.h"
 
+asm ("\n// BEGIN GLOBAL VAR DECL: __gomp_team_num"
+     "\n.extern .shared .align 4 .u32 __gomp_team_num[1];");
+
 void
 GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams,
 		unsigned int thread_limit, unsigned int flags)
@@ -48,9 +50,9 @@ omp_get_num_teams (void)
 int
 omp_get_team_num (void)
 {
-  int ctaid;
-  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid));
-  return ctaid;
+  int team_num;
+  asm ("ld.shared.u32\t%0, [__gomp_team_num]" : "=r" (team_num));
+  return team_num;
 }
 
 ialias (omp_get_num_teams)

	Jakub


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2021-11-12 21:22 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-12 13:20 [PATCH] libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound Jakub Jelinek
2021-11-12 13:27 ` [PATCH] libgomp, nvptx, v2: " Jakub Jelinek
2021-11-12 17:58   ` [PATCH] libgomp, nvptx, v3: " Jakub Jelinek
2021-11-12 19:16     ` Alexander Monakov
2021-11-12 19:47       ` Jakub Jelinek
2021-11-12 19:49         ` Jakub Jelinek
2021-11-12 21:21           ` Alexander Monakov
2021-11-12 21:08     ` Alexander Monakov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).