From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
To: GCC Patches <gcc-patches@gcc.gnu.org>
Cc: Richard Sandiford <Richard.Sandiford@arm.com>,
	Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: Re: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]
Date: Mon, 16 Oct 2023 12:50:46 +0000
Message-ID: <PAWPR08MB89824DA98CDF333819A822E383D7A@PAWPR08MB8982.eurprd08.prod.outlook.com>
In-Reply-To: <PAWPR08MB89828C1417B4C980F7E2687B83F0A@PAWPR08MB8982.eurprd08.prod.outlook.com>

 

ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches <gcc-patches@gcc.gnu.org>
Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061] 
 

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries, gives better performance than locking atomics, and is what
most users expect.

Note that 128-bit atomic loads use a load/store exclusive loop if LSE2 is not
supported.  This results in an implicit store, which is invisible to software
as long as the given address is writeable (which will be true when using
atomics in actual code).
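
As an illustration (this example is not part of the patch): the implicit
store only matters if the atomic object lives in read-only memory, which
ordinary atomic variables never do.  A minimal sketch, assuming GCC lowers
the 16-byte load to a libatomic call:

#include <stdatomic.h>

/* An ordinary (writeable) atomic object: safe with the exclusive loop.  */
_Atomic __int128 counter;

__int128
read_counter (void)
{
  /* Without LSE2 this becomes a call into libatomic, which runs an
     LDAXP/STXP loop and so writes back the value it just read.  */
  return atomic_load_explicit (&counter, memory_order_acquire);
}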

A simple test on an old Cortex-A72 showed a 2.7x speedup for 128-bit atomics.
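
The benchmark itself is not included here; as a sketch, the kind of timing
loop one might use (the details are an assumption, not taken from the patch;
build with 'gcc -O2 bench.c -latomic'):

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

_Atomic __int128 v;

int
main (void)
{
  enum { ITERS = 10000000 };
  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  /* Each iteration calls the 128-bit fetch_add in libatomic.  */
  for (int i = 0; i < ITERS; i++)
    atomic_fetch_add_explicit (&v, 1, memory_order_seq_cst);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  printf ("%.1f ns per 128-bit fetch_add\n",
          ((t1.tv_sec - t0.tv_sec) * 1e9
           + (t1.tv_nsec - t0.tv_nsec)) / ITERS);
  return 0;
}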

Passes regress, OK for commit?

libatomic/
        PR target/110061
        * config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
        * config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
        State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,21 @@
    <http://www.gnu.org/licenses/>.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries and gives better
+   performance than locking atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat_<op>_16 entry points are ARMv8.0.
+   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
+
+
         .arch   armv8-a+lse
 
 #define ENTRY(name)             \
@@ -37,6 +52,10 @@ name:                                \
         .cfi_endproc;           \
         .size name, .-name;
 
+#define ALIAS(alias,name)      \
+       .global alias;          \
+       .set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +89,24 @@ name:                                \
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+       mov     x5, x0
+       cbnz    w1, 2f
+
+       /* RELAXED.  */
+1:     ldxp    res0, res1, [x5]
+       stxp    w4, res0, res1, [x5]
+       cbnz    w4, 1b
+       ret
+
+       /* ACQUIRE/CONSUME/SEQ_CST.  */
+2:     ldaxp   res0, res1, [x5]
+       stxp    w4, res0, res1, [x5]
+       cbnz    w4, 2b
+       ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
         cbnz    w1, 1f
 
@@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+       cbnz    w4, 2f
+
+       /* RELAXED.  */
+1:     ldxp    xzr, tmp0, [x0]
+       stxp    w4, in0, in1, [x0]
+       cbnz    w4, 1b
+       ret
+
+       /* RELEASE/SEQ_CST.  */
+2:     ldxp    xzr, tmp0, [x0]
+       stlxp   w4, in0, in1, [x0]
+       cbnz    w4, 2b
+       ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
         cbnz    w4, 1f
 
@@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
         ret
 
         /* RELEASE/SEQ_CST.  */
-1:     ldaxp   xzr, tmp0, [x0]
+1:     ldxp    xzr, tmp0, [x0]
         stlxp   w4, in0, in1, [x0]
         cbnz    w4, 1b
         ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
         stxp    w4, in0, in1, [x5]
         cbnz    w4, 3b
         ret
-4:
-       cmp     w4, RELEASE
-       b.ne    6f
 
-       /* RELEASE.  */
-5:     ldxp    res0, res1, [x5]
+       /* RELEASE/ACQ_REL/SEQ_CST.  */
+4:     ldaxp   res0, res1, [x5]
         stlxp   w4, in0, in1, [x5]
-       cbnz    w4, 5b
+       cbnz    w4, 4b
         ret
+END (libat_exchange_16)
 
-       /* ACQ_REL/SEQ_CST.  */
-6:     ldaxp   res0, res1, [x5]
-       stlxp   w4, in0, in1, [x5]
-       cbnz    w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+       ldp     exp0, exp1, [x1]
+       cbz     w4, 3f
+       cmp     w4, RELEASE
+       b.hs    4f
+
+       /* ACQUIRE/CONSUME.  */
+1:     ldaxp   tmp0, tmp1, [x0]
+       cmp     tmp0, exp0
+       ccmp    tmp1, exp1, 0, eq
+       bne     2f
+       stxp    w4, in0, in1, [x0]
+       cbnz    w4, 1b
+       mov     x0, 1
         ret
-END (libat_exchange_16_i1)
+
+2:     stp     tmp0, tmp1, [x1]
+       mov     x0, 0
+       ret
+
+       /* RELAXED.  */
+3:     ldxp    tmp0, tmp1, [x0]
+       cmp     tmp0, exp0
+       ccmp    tmp1, exp1, 0, eq
+       bne     2b
+       stxp    w4, in0, in1, [x0]
+       cbnz    w4, 3b
+       mov     x0, 1
+       ret
+
+       /* RELEASE/ACQ_REL/SEQ_CST.  */
+4:     ldaxp   tmp0, tmp1, [x0]
+       cmp     tmp0, exp0
+       ccmp    tmp1, exp1, 0, eq
+       bne     2b
+       stlxp   w4, in0, in1, [x0]
+       cbnz    w4, 4b
+       mov     x0, 1
+       ret
+END (libat_compare_exchange_16)
 
 
 ENTRY (libat_compare_exchange_16_i1)
@@ -180,7 +267,7 @@ ENTRY (libat_compare_exchange_16_i1)
 END (libat_compare_exchange_16_i1)
 
 
-ENTRY (libat_fetch_add_16_i1)
+ENTRY (libat_fetch_add_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -199,10 +286,10 @@ ENTRY (libat_fetch_add_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_add_16_i1)
+END (libat_fetch_add_16)
 
 
-ENTRY (libat_add_fetch_16_i1)
+ENTRY (libat_add_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -221,10 +308,10 @@ ENTRY (libat_add_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_add_fetch_16_i1)
+END (libat_add_fetch_16)
 
 
-ENTRY (libat_fetch_sub_16_i1)
+ENTRY (libat_fetch_sub_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -243,10 +330,10 @@ ENTRY (libat_fetch_sub_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_sub_16_i1)
+END (libat_fetch_sub_16)
 
 
-ENTRY (libat_sub_fetch_16_i1)
+ENTRY (libat_sub_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -265,10 +352,10 @@ ENTRY (libat_sub_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_sub_fetch_16_i1)
+END (libat_sub_fetch_16)
 
 
-ENTRY (libat_fetch_or_16_i1)
+ENTRY (libat_fetch_or_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -287,10 +374,10 @@ ENTRY (libat_fetch_or_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_or_16_i1)
+END (libat_fetch_or_16)
 
 
-ENTRY (libat_or_fetch_16_i1)
+ENTRY (libat_or_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -309,10 +396,10 @@ ENTRY (libat_or_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_or_fetch_16_i1)
+END (libat_or_fetch_16)
 
 
-ENTRY (libat_fetch_and_16_i1)
+ENTRY (libat_fetch_and_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -331,10 +418,10 @@ ENTRY (libat_fetch_and_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_and_16_i1)
+END (libat_fetch_and_16)
 
 
-ENTRY (libat_and_fetch_16_i1)
+ENTRY (libat_and_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -353,10 +440,10 @@ ENTRY (libat_and_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_and_fetch_16_i1)
+END (libat_and_fetch_16)
 
 
-ENTRY (libat_fetch_xor_16_i1)
+ENTRY (libat_fetch_xor_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -375,10 +462,10 @@ ENTRY (libat_fetch_xor_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_xor_16_i1)
+END (libat_fetch_xor_16)
 
 
-ENTRY (libat_xor_fetch_16_i1)
+ENTRY (libat_xor_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -397,10 +484,10 @@ ENTRY (libat_xor_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_xor_fetch_16_i1)
+END (libat_xor_fetch_16)
 
 
-ENTRY (libat_fetch_nand_16_i1)
+ENTRY (libat_fetch_nand_16)
         mov     x5, x0
         mvn     in0, in0
         mvn     in1, in1
@@ -421,10 +508,10 @@ ENTRY (libat_fetch_nand_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_nand_16_i1)
+END (libat_fetch_nand_16)
 
 
-ENTRY (libat_nand_fetch_16_i1)
+ENTRY (libat_nand_fetch_16)
         mov     x5, x0
         mvn     in0, in0
         mvn     in1, in1
@@ -445,21 +532,38 @@ ENTRY (libat_nand_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_nand_fetch_16_i1)
+END (libat_nand_fetch_16)
 
 
-ENTRY (libat_test_and_set_16_i1)
-       mov     w2, 1
-       cbnz    w1, 2f
-
-       /* RELAXED.  */
-       swpb    w0, w2, [x0]
-       ret
+/* __atomic_test_and_set is always inlined, so this entry is unused and
+   only required for completeness.  */
+ENTRY (libat_test_and_set_16)
 
-       /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:     swpalb  w0, w2, [x0]
+       /* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+       mov     x5, x0
+1:     ldaxrb  w0, [x5]
+       stlxrb  w4, w2, [x5]
+       cbnz    w4, 1b
         ret
-END (libat_test_and_set_16_i1)
+END (libat_test_and_set_16)
+
+
+/* Alias entry points which are the same in baseline and LSE2.  */
+
+ALIAS (libat_exchange_16_i1, libat_exchange_16)
+ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
+ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
+ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
+ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
+ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
+ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
+ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
+ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
+ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
+ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
+ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
+ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
+ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
 
 
 /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */
diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
index bea26825b4f75bb8ff348ab4b5fc45f4a5bd561e..851c78c01cd643318aaa52929ce4550266238b79 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -35,10 +35,19 @@
 #endif
 #define IFUNC_NCOND(N)  (1)
 
-#if N == 16 && IFUNC_ALT != 0
+#endif /* HAVE_IFUNC */
+
+/* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
+#if N == 16
 # define DONE 1
 #endif
 
-#endif /* HAVE_IFUNC */
+/* State we have lock-free 128-bit atomics.  */
+#undef FAST_ATOMIC_LDST_16
+#define FAST_ATOMIC_LDST_16            1
+#undef MAYBE_HAVE_ATOMIC_CAS_16
+#define MAYBE_HAVE_ATOMIC_CAS_16       1
+#undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
+#define MAYBE_HAVE_ATOMIC_EXCHANGE_16  1
 
 #include_next <host-config.h>
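
For completeness, a caller's-eye view (again an illustration, not part of the
patch): GCC lowers a 16-byte compare-exchange to a libatomic call, which with
this patch resolves to the lock-free libat_compare_exchange_16 code above
rather than a locking fallback.  Build with 'gcc -O2 cas16.c -latomic':

#include <stdatomic.h>
#include <stdbool.h>

_Atomic __int128 slot;

bool
claim (__int128 expected, __int128 desired)
{
  /* Lowered to a 16-byte compare-exchange call into libatomic; with
     this patch it is lock-free on all AArch64 cores.  */
  return atomic_compare_exchange_strong_explicit
         (&slot, &expected, desired,
          memory_order_acq_rel, memory_order_acquire);
}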
