From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <vries@sourceware.org>
Received: by sourceware.org (Postfix, from userid 2205)
 id 33A893857029; Tue,  1 Feb 2022 18:29:36 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 33A893857029
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Tom de Vries <vries@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r12-6966] [nvptx] Add some support for .local atomics
X-Act-Checkin: gcc
X-Git-Author: Tom de Vries <tdevries@suse.de>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: ca902055d056773bd0ca80f68bca4b20ad0e183f
X-Git-Newrev: e0451f93d9faa13495132f4e246e9bef30b51417
Message-Id: <20220201182936.33A893857029@sourceware.org>
Date: Tue,  1 Feb 2022 18:29:36 +0000 (GMT)
X-BeenThere: gcc-cvs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-cvs mailing list <gcc-cvs.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-cvs>,
 <mailto:gcc-cvs-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-cvs/>
List-Help: <mailto:gcc-cvs-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-cvs>,
 <mailto:gcc-cvs-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Tue, 01 Feb 2022 18:29:36 -0000

https://gcc.gnu.org/g:e0451f93d9faa13495132f4e246e9bef30b51417

commit r12-6966-ge0451f93d9faa13495132f4e246e9bef30b51417
Author: Tom de Vries <tdevries@suse.de>
Date:   Fri Jan 21 21:46:05 2022 +0100

    [nvptx] Add some support for .local atomics
    
    The ptx insn atom doesn't support local memory.  In case of doing an atomic
    operation on local memory, we run into:
    ...
    operation not supported on global/shared address space
    ...
    This is the cuGetErrorString message for CUDA_ERROR_INVALID_ADDRESS_SPACE.
    
    The message is somewhat confusing given that actually the operation is not
    supported on local address space.
    
    Fix this by falling back on a non-atomic version when detecting
    a frame-related memory operand.
    
    This only solves some cases that are detected at compile-time.  It does
    however fix the openacc private-atomic-* test-cases.
    
    Tested on x86_64 with nvptx accelerator.
    
    gcc/ChangeLog:
    
    2022-01-27  Tom de Vries  <tdevries@suse.de>
    
            * config/nvptx/nvptx.md (define_insn "atomic_compare_and_swap<mode>_1")
            (define_insn "atomic_exchange<mode>")
            (define_insn "atomic_fetch_add<mode>")
            (define_insn "atomic_fetch_addsf")
            (define_insn "atomic_fetch_<logic><mode>"): Output non-atomic version
            if memory operands is frame-relative.
    
    gcc/testsuite/ChangeLog:
    
    2022-01-31  Tom de Vries  <tdevries@suse.de>
    
            * gcc.target/nvptx/stack-atomics-run.c: New test.
    
    libgomp/ChangeLog:
    
    2022-01-27  Tom de Vries  <tdevries@suse.de>
    
            * testsuite/libgomp.oacc-c-c++-common/private-atomic-1.c: Remove
            PR83812 workaround.
            * testsuite/libgomp.oacc-fortran/private-atomic-1-vector.f90: Same.
            * testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90: Same.

Diff:
---
 gcc/config/nvptx/nvptx.md                          | 82 +++++++++++++++++++++-
 gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c | 44 ++++++++++++
 .../libgomp.oacc-c-c++-common/private-atomic-1.c   |  7 --
 .../private-atomic-1-vector.f90                    |  7 --
 .../private-atomic-1-worker.f90                    |  7 --
 5 files changed, 124 insertions(+), 23 deletions(-)
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 773ae8fdc6f..9cbbd956f9d 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -1790,11 +1790,28 @@
 	(unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))]
   ""
   {
+    struct address_info info;
+    decompose_mem_address (&info, operands[1]);
+    if (info.base != NULL && REG_P (*info.base)
+	&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
+      {
+	output_asm_insn ("{", NULL);
+	output_asm_insn ("\\t"	      ".reg.pred"  "\\t" "%%eq_p;", NULL);
+	output_asm_insn ("\\t"	      ".reg%t0"	   "\\t" "%%val;", operands);
+	output_asm_insn ("\\t"	      "ld%A1%t0"   "\\t" "%%val,%1;", operands);
+	output_asm_insn ("\\t"	      "setp.eq%t0" "\\t" "%%eq_p, %%val, %2;",
+			 operands);
+	output_asm_insn ("@%%eq_p\\t" "st%A1%t0"   "\\t" "%1,%3;", operands);
+	output_asm_insn ("\\t"	      "mov%t0"	   "\\t" "%0,%%val;", operands);
+	output_asm_insn ("}", NULL);
+	return "";
+      }
     const char *t
-      = "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;";
+      = "\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;";
     return nvptx_output_atomic_insn (t, operands, 1, 4);
   }
-  [(set_attr "atomic" "true")])
+  [(set_attr "atomic" "true")
+   (set_attr "predicable" "false")])
 
 (define_insn "atomic_exchange<mode>"
   [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R")	;; output
@@ -1806,6 +1823,19 @@
 	(match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))]	;; input
   ""
   {
+    struct address_info info;
+    decompose_mem_address (&info, operands[1]);
+    if (info.base != NULL && REG_P (*info.base)
+	&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
+      {
+	output_asm_insn ("{", NULL);
+	output_asm_insn ("\\t"	 ".reg%t0"  "\\t" "%%val;", operands);
+	output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
+	output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%2;", operands);
+	output_asm_insn ("%.\\t" "mov%t0"   "\\t" "%0,%%val;", operands);
+	output_asm_insn ("}", NULL);
+	return "";
+      }
     const char *t
       = "%.\tatom%A1.exch.b%T0\t%0, %1, %2;";
     return nvptx_output_atomic_insn (t, operands, 1, 3);
@@ -1823,6 +1853,22 @@
 	(match_dup 1))]
   ""
   {
+    struct address_info info;
+    decompose_mem_address (&info, operands[1]);
+    if (info.base != NULL && REG_P (*info.base)
+	&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
+      {
+	output_asm_insn ("{", NULL);
+	output_asm_insn ("\\t"	 ".reg%t0"  "\\t" "%%val;", operands);
+	output_asm_insn ("\\t"	 ".reg%t0"  "\\t" "%%update;", operands);
+	output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
+	output_asm_insn ("%.\\t" "add%t0"   "\\t" "%%update,%%val,%2;",
+			 operands);
+	output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands);
+	output_asm_insn ("%.\\t" "mov%t0"   "\\t" "%0,%%val;", operands);
+	output_asm_insn ("}", NULL);
+	return "";
+      }
     const char *t
       = "%.\\tatom%A1.add%t0\\t%0, %1, %2;";
     return nvptx_output_atomic_insn (t, operands, 1, 3);
@@ -1840,6 +1886,22 @@
 	(match_dup 1))]
   ""
   {
+    struct address_info info;
+    decompose_mem_address (&info, operands[1]);
+    if (info.base != NULL && REG_P (*info.base)
+	&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
+      {
+	output_asm_insn ("{", NULL);
+	output_asm_insn ("\\t"	 ".reg%t0"  "\\t" "%%val;", operands);
+	output_asm_insn ("\\t"	 ".reg%t0"  "\\t" "%%update;", operands);
+	output_asm_insn ("%.\\t" "ld%A1%t0" "\\t" "%%val,%1;", operands);
+	output_asm_insn ("%.\\t" "add%t0"   "\\t" "%%update,%%val,%2;",
+			 operands);
+	output_asm_insn ("%.\\t" "st%A1%t0" "\\t" "%1,%%update;", operands);
+	output_asm_insn ("%.\\t" "mov%t0"   "\\t" "%0,%%val;", operands);
+	output_asm_insn ("}", NULL);
+	return "";
+      }
     const char *t
       = "%.\\tatom%A1.add%t0\\t%0, %1, %2;";
     return nvptx_output_atomic_insn (t, operands, 1, 3);
@@ -1860,6 +1922,22 @@
 	(match_dup 1))]
   "<MODE>mode == SImode || TARGET_SM35"
   {
+    struct address_info info;
+    decompose_mem_address (&info, operands[1]);
+    if (info.base != NULL && REG_P (*info.base)
+	&& REGNO_PTR_FRAME_P (REGNO (*info.base)))
+      {
+	output_asm_insn ("{", NULL);
+	output_asm_insn ("\\t"	 ".reg.b%T0"    "\\t" "%%val;", operands);
+	output_asm_insn ("\\t"	 ".reg.b%T0"    "\\t" "%%update;", operands);
+	output_asm_insn ("%.\\t" "ld%A1%t0"     "\\t" "%%val,%1;", operands);
+	output_asm_insn ("%.\\t" "<logic>.b%T0" "\\t" "%%update,%%val,%2;",
+			 operands);
+	output_asm_insn ("%.\\t" "st%A1%t0"     "\\t" "%1,%%update;", operands);
+	output_asm_insn ("%.\\t" "mov%t0"       "\\t" "%0,%%val;", operands);
+	output_asm_insn ("}", NULL);
+	return "";
+      }
     const char *t
       = "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;";
     return nvptx_output_atomic_insn (t, operands, 1, 3);
diff --git a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
new file mode 100644
index 00000000000..ad8e2f842fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+
+enum memmodel {
+  MEMMODEL_RELAXED = 0
+};
+
+int
+main (void)
+{
+  int a, b;
+
+  a = 1;
+  __atomic_fetch_add (&a, 1, MEMMODEL_RELAXED);
+  if (a != 2)
+    __builtin_abort ();
+
+  a = 0;
+  __atomic_fetch_or (&a, 1, MEMMODEL_RELAXED);
+  if (a != 1)
+    __builtin_abort ();
+  
+  a = 1;
+  b = -1;
+  b = __atomic_exchange_n (&a, 0, MEMMODEL_RELAXED);
+  if (a != 0)
+    __builtin_abort ();
+  if (b != 1)
+    __builtin_abort ();
+
+  a = 1;
+  b = -1;
+  {
+    int expected = a;
+    b = __atomic_compare_exchange_n (&a, &expected, 0, 0, MEMMODEL_RELAXED,
+				     MEMMODEL_RELAXED);
+  }
+  if (a != 0)
+    __builtin_abort ();
+  if (b != 1)
+    __builtin_abort ();
+
+  
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/private-atomic-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/private-atomic-1.c
index e651012f463..2f9e6f2d8a5 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/private-atomic-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/private-atomic-1.c
@@ -32,13 +32,6 @@ int main (void)
 	{
 #pragma acc atomic update
 	  ++v;
-	  /* nvptx offloading: PR83812 "operation not supported on global/shared address space".
-	     { dg-output "(\n|\r\n|\r)libgomp: cuStreamSynchronize error: operation not supported on global/shared address space(\n|\r\n|\r)$" { target openacc_nvidia_accel_selected } }
-	       Scan for what we expect in the "XFAILed" case (without actually XFAILing).
-	     { dg-shouldfail "XFAILed" { openacc_nvidia_accel_selected } }
-	       ... instead of 'dg-xfail-run-if' so that 'dg-output' is evaluated at all.
-	     { dg-final { if { [dg-process-target { xfail openacc_nvidia_accel_selected }] == "F" } { xfail "[testname-for-summary] really is XFAILed" } } }
-	       ... so that we still get an XFAIL visible in the log.  */
 	}
 
       res += (v == -222 + 121);
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-vector.f90 b/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-vector.f90
index e916837fc8f..3f39d9e18e8 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-vector.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-vector.f90
@@ -25,13 +25,6 @@ program main
       do i = 0, 31
         !$acc atomic update
         w = w + 1
-        ! nvptx offloading: PR83812 "operation not supported on global/shared address space".
-        ! { dg-output "(\n|\r\n|\r)libgomp: cuStreamSynchronize error: operation not supported on global/shared address space(\n|\r\n|\r)$" { target openacc_nvidia_accel_selected } }
-        !   Scan for what we expect in the "XFAILed" case (without actually XFAILing).
-        ! { dg-shouldfail "XFAILed" { openacc_nvidia_accel_selected } }
-        !   ... instead of 'dg-xfail-run-if' so that 'dg-output' is evaluated at all.
-        ! { dg-final { if { [dg-process-target { xfail openacc_nvidia_accel_selected }] == "F" } { xfail "[testname-for-summary] really is XFAILed" } } }
-        !   ... so that we still get an XFAIL visible in the log.
         !$acc end atomic
       end do
       arr(j) = w
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90 b/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90
index 5fa157b1674..a86b7a491bc 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90
@@ -25,13 +25,6 @@ program main
       do i = 0, 31
         !$acc atomic update
         w = w + 1
-        ! nvptx offloading: PR83812 "operation not supported on global/shared address space".
-        ! { dg-output "(\n|\r\n|\r)libgomp: cuStreamSynchronize error: operation not supported on global/shared address space(\n|\r\n|\r)$" { target openacc_nvidia_accel_selected } }
-        !   Scan for what we expect in the "XFAILed" case (without actually XFAILing).
-        ! { dg-shouldfail "XFAILed" { openacc_nvidia_accel_selected } }
-        !   ... instead of 'dg-xfail-run-if' so that 'dg-output' is evaluated at all.
-        ! { dg-final { if { [dg-process-target { xfail openacc_nvidia_accel_selected }] == "F" } { xfail "[testname-for-summary] really is XFAILed" } } }
-        !   ... so that we still get an XFAIL visible in the log.
         !$acc end atomic
       end do
       arr(j) = w