public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-9984] LoongArch: Remove redundant barrier instructions before LL-SC loops
@ 2023-11-16 10:12 Xi Ruoyao
  0 siblings, 0 replies; only message in thread
From: Xi Ruoyao @ 2023-11-16 10:12 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:b8c198af32dbb23751d8ead6e7d9ae38402d56b2

commit r12-9984-gb8c198af32dbb23751d8ead6e7d9ae38402d56b2
Author: Xi Ruoyao <xry111@xry111.site>
Date:   Mon Nov 6 16:06:08 2023 +0800

    LoongArch: Remove redundant barrier instructions before LL-SC loops
    
    This is isomorphic to the LLVM changes [1-2].
    
    On LoongArch, the LL and SC instructions have memory barrier semantics:
    
    - LL: <memory-barrier> + <load-exclusive>
    - SC: <store-conditional> + <memory-barrier>
    
    But the compare and swap operation is allowed to fail, and if it fails
    the SC instruction is not executed, thus the guarantee of acquiring
    semantics cannot be ensured. Therefore, an acquire barrier needs to be
    generated when failure_memorder includes an acquire operation.
    
    On CPUs implementing LoongArch v1.10 or later, "dbar 0b10100" is an
    acquire barrier; on CPUs implementing LoongArch v1.00, it is a full
    barrier.  So it's always enough for acquire semantics.  OTOH if an
    acquire semantic is not needed, we still need the "dbar 0x700" as the
    load-load barrier like all LL-SC loops.
    
    [1]:https://github.com/llvm/llvm-project/pull/67391
    [2]:https://github.com/llvm/llvm-project/pull/69339
    
    gcc/ChangeLog:
    
            * config/loongarch/loongarch.cc
            (loongarch_memmodel_needs_release_fence): Remove.
            (loongarch_cas_failure_memorder_needs_acquire): New static
            function.
            (loongarch_print_operand): Redefine 'G' for the barrier on CAS
            failure.
            * config/loongarch/sync.md (atomic_cas_value_strong<mode>):
            Remove the redundant barrier before the LL instruction, and
            emit an acquire barrier on failure if needed by
            failure_memorder.
            (atomic_cas_value_cmp_and_7_<mode>): Likewise.
            (atomic_cas_value_add_7_<mode>): Remove the unnecessary barrier
            before the LL instruction.
            (atomic_cas_value_sub_7_<mode>): Likewise.
            (atomic_cas_value_and_7_<mode>): Likewise.
            (atomic_cas_value_xor_7_<mode>): Likewise.
            (atomic_cas_value_or_7_<mode>): Likewise.
            (atomic_cas_value_nand_7_<mode>): Likewise.
            (atomic_cas_value_exchange_7_<mode>): Likewise.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/loongarch/cas-acquire.c: New test.
    
    (cherry picked from commit 4d86dc51e34d2a5695b617afeb56e3414836a79a)

Diff:
---
 gcc/config/loongarch/loongarch.cc                | 30 +++++----
 gcc/config/loongarch/sync.md                     | 49 ++++++--------
 gcc/testsuite/gcc.target/loongarch/cas-acquire.c | 82 ++++++++++++++++++++++++
 3 files changed, 119 insertions(+), 42 deletions(-)

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 9cc9c74cd73..41819eba3d5 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -4321,27 +4321,27 @@ loongarch_memmodel_needs_rel_acq_fence (enum memmodel model)
     }
 }
 
-/* Return true if a FENCE should be emitted to before a memory access to
-   implement the release portion of memory model MODEL.  */
+/* Return true if a FENCE should be emitted after a failed CAS to
+   implement the acquire semantic of failure_memorder.  */
 
 static bool
-loongarch_memmodel_needs_release_fence (enum memmodel model)
+loongarch_cas_failure_memorder_needs_acquire (enum memmodel model)
 {
-  switch (model)
+  switch (memmodel_base (model))
     {
+    case MEMMODEL_ACQUIRE:
     case MEMMODEL_ACQ_REL:
     case MEMMODEL_SEQ_CST:
-    case MEMMODEL_SYNC_SEQ_CST:
-    case MEMMODEL_RELEASE:
-    case MEMMODEL_SYNC_RELEASE:
       return true;
 
-    case MEMMODEL_ACQUIRE:
-    case MEMMODEL_CONSUME:
-    case MEMMODEL_SYNC_ACQUIRE:
     case MEMMODEL_RELAXED:
+    case MEMMODEL_RELEASE:
       return false;
 
+    /* MEMMODEL_CONSUME is deliberately not handled because it's always
+       replaced by MEMMODEL_ACQUIRE as of now.  If you see an ICE caused by
+       MEMMODEL_CONSUME, read the change (re)introducing it carefully and
+       decide what to do.  See PR 59448 and get_memmodel in builtins.cc.  */
     default:
       gcc_unreachable ();
     }
@@ -4368,7 +4368,8 @@ loongarch_memmodel_needs_release_fence (enum memmodel model)
    'V'	Print exact log2 of CONST_INT OP element 0 of a replicated
 	  CONST_VECTOR in decimal.
    'A'	Print a _DB suffix if the memory model requires a release.
-   'G'	Print a DBAR insn if the memory model requires a release.
+   'G'	Print a DBAR insn for CAS failure (with an acquire semantic if
+	needed, otherwise a simple load-load barrier).
    'i'	Print i if the operand is not a register.  */
 
 static void
@@ -4489,8 +4490,11 @@ loongarch_print_operand (FILE *file, rtx op, int letter)
       break;
 
     case 'G':
-      if (loongarch_memmodel_needs_release_fence ((enum memmodel) INTVAL (op)))
-	fputs ("dbar\t0", file);
+      if (loongarch_cas_failure_memorder_needs_acquire (
+	    memmodel_from_int (INTVAL (op))))
+	fputs ("dbar\t0b10100", file);
+      else
+	fputs ("dbar\t0x700", file);
       break;
 
     case 'i':
diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md
index 45be1442439..b8763b8f9d1 100644
--- a/gcc/config/loongarch/sync.md
+++ b/gcc/config/loongarch/sync.md
@@ -129,19 +129,18 @@
    (clobber (match_scratch:GPR 6 "=&r"))]
   ""
 {
-  return "%G5\\n\\t"
-	 "1:\\n\\t"
+  return "1:\\n\\t"
 	 "ll.<amo>\\t%0,%1\\n\\t"
 	 "bne\\t%0,%z2,2f\\n\\t"
 	 "or%i3\\t%6,$zero,%3\\n\\t"
 	 "sc.<amo>\\t%6,%1\\n\\t"
-	 "beq\\t$zero,%6,1b\\n\\t"
+	 "beqz\\t%6,1b\\n\\t"
 	 "b\\t3f\\n\\t"
 	 "2:\\n\\t"
-	 "dbar\\t0x700\\n\\t"
+	 "%G5\\n\\t"
 	 "3:\\n\\t";
 }
-  [(set (attr "length") (const_int 32))])
+  [(set (attr "length") (const_int 28))])
 
 (define_expand "atomic_compare_and_swap<mode>"
   [(match_operand:SI 0 "register_operand" "")   ;; bool output
@@ -234,8 +233,7 @@
    (clobber (match_scratch:GPR 7 "=&r"))]
   ""
 {
-  return "%G6\\n\\t"
-	 "1:\\n\\t"
+  return "1:\\n\\t"
 	 "ll.<amo>\\t%0,%1\\n\\t"
 	 "and\\t%7,%0,%2\\n\\t"
 	 "bne\\t%7,%z4,2f\\n\\t"
@@ -245,10 +243,10 @@
 	 "beq\\t$zero,%7,1b\\n\\t"
 	 "b\\t3f\\n\\t"
 	 "2:\\n\\t"
-	 "dbar\\t0x700\\n\\t"
+	 "%G6\\n\\t"
 	 "3:\\n\\t";
 }
-  [(set (attr "length") (const_int 40))])
+  [(set (attr "length") (const_int 36))])
 
 (define_expand "atomic_compare_and_swap<mode>"
   [(match_operand:SI 0 "register_operand" "")   ;; bool output
@@ -303,8 +301,7 @@
    (clobber (match_scratch:GPR 8 "=&r"))]
   ""
 {
-  return "%G6\\n\\t"
-	 "1:\\n\\t"
+  return "1:\\n\\t"
 	 "ll.<amo>\\t%0,%1\\n\\t"
 	 "and\\t%7,%0,%3\\n\\t"
 	 "add.w\\t%8,%0,%z5\\n\\t"
@@ -314,7 +311,7 @@
 	 "beq\\t$zero,%7,1b";
 }
 
-  [(set (attr "length") (const_int 32))])
+  [(set (attr "length") (const_int 28))])
 
 (define_insn "atomic_cas_value_sub_7_<mode>"
   [(set (match_operand:GPR 0 "register_operand" "=&r")				;; res
@@ -330,8 +327,7 @@
    (clobber (match_scratch:GPR 8 "=&r"))]
   ""
 {
-  return "%G6\\n\\t"
-	 "1:\\n\\t"
+  return "1:\\n\\t"
 	 "ll.<amo>\\t%0,%1\\n\\t"
 	 "and\\t%7,%0,%3\\n\\t"
 	 "sub.w\\t%8,%0,%z5\\n\\t"
@@ -340,7 +336,7 @@
 	 "sc.<amo>\\t%7,%1\\n\\t"
 	 "beq\\t$zero,%7,1b";
 }
-  [(set (attr "length") (const_int 32))])
+  [(set (attr "length") (const_int 28))])
 
 (define_insn "atomic_cas_value_and_7_<mode>"
   [(set (match_operand:GPR 0 "register_operand" "=&r")				;; res
@@ -356,8 +352,7 @@
    (clobber (match_scratch:GPR 8 "=&r"))]
   ""
 {
-  return "%G6\\n\\t"
-	 "1:\\n\\t"
+  return "1:\\n\\t"
 	 "ll.<amo>\\t%0,%1\\n\\t"
 	 "and\\t%7,%0,%3\\n\\t"
 	 "and\\t%8,%0,%z5\\n\\t"
@@ -366,7 +361,7 @@
 	 "sc.<amo>\\t%7,%1\\n\\t"
 	 "beq\\t$zero,%7,1b";
 }
-  [(set (attr "length") (const_int 32))])
+  [(set (attr "length") (const_int 28))])
 
 (define_insn "atomic_cas_value_xor_7_<mode>"
   [(set (match_operand:GPR 0 "register_operand" "=&r")				;; res
@@ -382,8 +377,7 @@
    (clobber (match_scratch:GPR 8 "=&r"))]
   ""
 {
-  return "%G6\\n\\t"
-	 "1:\\n\\t"
+  return "1:\\n\\t"
 	 "ll.<amo>\\t%0,%1\\n\\t"
 	 "and\\t%7,%0,%3\\n\\t"
 	 "xor\\t%8,%0,%z5\\n\\t"
@@ -393,7 +387,7 @@
 	 "beq\\t$zero,%7,1b";
 }
 
-  [(set (attr "length") (const_int 32))])
+  [(set (attr "length") (const_int 28))])
 
 (define_insn "atomic_cas_value_or_7_<mode>"
   [(set (match_operand:GPR 0 "register_operand" "=&r")				;; res
@@ -409,8 +403,7 @@
    (clobber (match_scratch:GPR 8 "=&r"))]
   ""
 {
-  return "%G6\\n\\t"
-	 "1:\\n\\t"
+  return "1:\\n\\t"
 	 "ll.<amo>\\t%0,%1\\n\\t"
 	 "and\\t%7,%0,%3\\n\\t"
 	 "or\\t%8,%0,%z5\\n\\t"
@@ -420,7 +413,7 @@
 	 "beq\\t$zero,%7,1b";
 }
 
-  [(set (attr "length") (const_int 32))])
+  [(set (attr "length") (const_int 28))])
 
 (define_insn "atomic_cas_value_nand_7_<mode>"
   [(set (match_operand:GPR 0 "register_operand" "=&r")				;; res
@@ -436,8 +429,7 @@
    (clobber (match_scratch:GPR 8 "=&r"))]
   ""
 {
-  return "%G6\\n\\t"
-	 "1:\\n\\t"
+  return "1:\\n\\t"
 	 "ll.<amo>\\t%0,%1\\n\\t"
 	 "and\\t%7,%0,%3\\n\\t"
 	 "and\\t%8,%0,%z5\\n\\t"
@@ -446,7 +438,7 @@
 	 "sc.<amo>\\t%7,%1\\n\\t"
 	 "beq\\t$zero,%7,1b";
 }
-  [(set (attr "length") (const_int 32))])
+  [(set (attr "length") (const_int 28))])
 
 (define_insn "atomic_cas_value_exchange_7_<mode>"
   [(set (match_operand:GPR 0 "register_operand" "=&r")
@@ -461,8 +453,7 @@
    (clobber (match_scratch:GPR 7 "=&r"))]
   ""
 {
-  return "%G6\\n\\t"
-	 "1:\\n\\t"
+  return "1:\\n\\t"
 	 "ll.<amo>\\t%0,%1\\n\\t"
 	 "and\\t%7,%0,%z3\\n\\t"
 	 "or%i5\\t%7,%7,%5\\n\\t"
diff --git a/gcc/testsuite/gcc.target/loongarch/cas-acquire.c b/gcc/testsuite/gcc.target/loongarch/cas-acquire.c
new file mode 100644
index 00000000000..ff7ba866f32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/cas-acquire.c
@@ -0,0 +1,82 @@
+/* { dg-do run } */
+/* { dg-require-effective-target c99_runtime } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-std=c99 -pthread" } */
+
+/* https://github.com/llvm/llvm-project/pull/67391#issuecomment-1752403934
+   reported that this had failed with GCC and 3A6000.  */
+
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+static unsigned int tags[32];
+static unsigned int vals[32];
+
+static void *
+writer_entry (void *data)
+{
+  atomic_uint *pt = (atomic_uint *)tags;
+  atomic_uint *pv = (atomic_uint *)vals;
+  /* Store the value before its tag; the reader traps if val < tag.  */
+  for (unsigned int n = 1; n < 10000; n++)
+    {
+      atomic_store_explicit (&pv[n & 31], n, memory_order_release);
+      atomic_store_explicit (&pt[n & 31], n, memory_order_release);
+    }
+  /* DATA is unused; the thread's result is always NULL.  */
+  return NULL;
+}
+
+static void *
+reader_entry (void *data)
+{
+  atomic_uint *pt = (atomic_uint *)tags;
+  atomic_uint *pv = (atomic_uint *)vals;
+  int i;
+  /* Poll all 32 slots forever; this thread is never joined by main.  */
+  for (;;)
+    {
+      for (i = 0; i < 32; i++)
+        {
+          unsigned int tag = 0;
+          bool res;
+          /* Expect 0; on failure TAG receives the value in the slot.  */
+          res = atomic_compare_exchange_weak_explicit (
+              &pt[i], &tag, 0, memory_order_acquire, memory_order_acquire);
+          if (!res)
+            {
+              unsigned int val;
+              /* Acquire-on-failure must make the value store visible.  */
+              val = atomic_load_explicit (&pv[i], memory_order_relaxed);
+              if (val < tag)
+                __builtin_trap ();
+            }
+        }
+    }
+  /* Not reached: the loop above has no exit.  */
+  return NULL;
+}
+
+/* Run the writer and reader concurrently; pass once the writer is done.  */
+int
+main (int argc, char *argv[])
+{
+  pthread_t writer, reader;
+  int res;
+
+  /* pthread_* report failure with a positive error number, never < 0.  */
+  res = pthread_create (&writer, NULL, writer_entry, NULL);
+  if (res != 0)
+    __builtin_trap ();
+  res = pthread_create (&reader, NULL, reader_entry, NULL);
+  if (res != 0)
+    __builtin_trap ();
+  /* Only the writer is joined; the reader dies when main returns.  */
+  res = pthread_join (writer, NULL);
+  if (res != 0)
+    __builtin_trap ();
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-11-16 10:12 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-16 10:12 [gcc r12-9984] LoongArch: Remove redundant barrier instructions before LL-SC loops Xi Ruoyao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).