* [patch] nvptx libgcc atomic routines
@ 2018-09-26 18:41 Cesar Philippidis
2018-10-05 13:14 ` Tom de Vries
0 siblings, 1 reply; 2+ messages in thread
From: Cesar Philippidis @ 2018-09-26 18:41 UTC (permalink / raw)
To: gcc-patches, Tom de Vries, Thomas Schwinge
[-- Attachment #1: Type: text/plain, Size: 558 bytes --]
This patch adds nvptx support for the atomic FETCH_AND_OP functions. I
recall that this used to be important for OpenACC reductions back in the
GCC 5.0 days before Nathan split reductions into four phases. Nowadays,
atomic reductions use a spin lock that's implemented directly by the
nvptx BE. Therefore, I'm not sure if the nvptx port still needs support
for atomic fetch_and_*.
Tom and Thomas, do either of you have any thoughts on this? Should I
commit it to trunk? I bootstrapped and regtested it for x86_64 Linux
with nvptx offloading.
Thanks,
Cesar
[-- Attachment #2: 0001-nvptx-libgcc-atomic-routines.patch --]
[-- Type: text/x-patch, Size: 9698 bytes --]
nvptx libgcc atomic routines
2018-XX-YY Cesar Philippidis <cesar@codesourcery.com>
libgcc/
* config/nvptx/atomic.c: New file.
* config/nvptx/t-nvptx (LIB2ADD): Include it.
(cherry picked from gomp-4_0-branch r223177)
---
libgcc/config/nvptx/atomic.c | 279 +++++++++++++++++++++++++++++++++++
libgcc/config/nvptx/t-nvptx | 3 +-
2 files changed, 281 insertions(+), 1 deletion(-)
create mode 100644 libgcc/config/nvptx/atomic.c
diff --git a/libgcc/config/nvptx/atomic.c b/libgcc/config/nvptx/atomic.c
new file mode 100644
index 00000000000..ab6cf23ef9d
--- /dev/null
+++ b/libgcc/config/nvptx/atomic.c
@@ -0,0 +1,279 @@
+/* Atomic operations for PTX.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
+ Contributed by Mentor Graphics.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/* Kernel helper for compare-and-exchange. */
+static int
+nvidia_cas (int oldval, int newval, int *ptr)
+{
+ int ret;
+
+ asm volatile ("atom.cas.b32 %0, [%1], %2, %3;" : "=r"(ret) : "r"(ptr),
+ "r"(oldval), "r"(newval));
+
+ return ret;
+}
+
+#define __kernel_cmpxchg (nvidia_cas)
+
+/* Kernel helper for memory barrier. */
+static void
+__threadfence_block (void)
+{
+ asm volatile ("membar.cta;");
+}
+
+#define __kernel_dmb (__threadfence_block)
+
+#define HIDDEN
+
+/* Warning: this assumes that all nvptx targets are little endian. */
+
+#define INVERT_MASK_1 0
+#define INVERT_MASK_2 0
+
+#define MASK_1 0xffu
+#define MASK_2 0xffffu
+
+#define FETCH_AND_OP_WORD(OP, PFX_OP, INF_OP) \
+ int HIDDEN \
+ __sync_fetch_and_##OP##_4 (int *ptr, int val) \
+ { \
+ int failure, tmp; \
+ \
+ do { \
+ tmp = *ptr; \
+ failure = __kernel_cmpxchg (tmp, PFX_OP (tmp INF_OP val), ptr); \
+ } while (failure != 0); \
+ \
+ return tmp; \
+ }
+
+FETCH_AND_OP_WORD (add, , +)
+FETCH_AND_OP_WORD (sub, , -)
+FETCH_AND_OP_WORD (or, , |)
+FETCH_AND_OP_WORD (and, , &)
+FETCH_AND_OP_WORD (xor, , ^)
+FETCH_AND_OP_WORD (nand, ~, &)
+
+#define NAME_oldval(OP, WIDTH) __sync_fetch_and_##OP##_##WIDTH
+#define NAME_newval(OP, WIDTH) __sync_##OP##_and_fetch_##WIDTH
+
+/* Implement both __sync_<op>_and_fetch and __sync_fetch_and_<op> for
+ subword-sized quantities. */
+
+#define SUBWORD_SYNC_OP(OP, PFX_OP, INF_OP, TYPE, WIDTH, RETURN) \
+ TYPE HIDDEN \
+ NAME##_##RETURN (OP, WIDTH) (TYPE *ptr, TYPE val) \
+ { \
+ int *wordptr = (int *) ((unsigned long) ptr & ~3); \
+ unsigned int mask, shift, oldval, newval; \
+ int failure; \
+ \
+ shift = (((unsigned long) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH; \
+ mask = MASK_##WIDTH << shift; \
+ \
+ do { \
+ oldval = *wordptr; \
+ newval = ((PFX_OP (((oldval & mask) >> shift) \
+ INF_OP (unsigned int) val)) << shift) & mask; \
+ newval |= oldval & ~mask; \
+ failure = __kernel_cmpxchg (oldval, newval, wordptr); \
+ } while (failure != 0); \
+ \
+ return (RETURN & mask) >> shift; \
+ }
+
+SUBWORD_SYNC_OP (add, , +, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (sub, , -, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (or, , |, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (and, , &, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (xor, , ^, unsigned short, 2, oldval)
+SUBWORD_SYNC_OP (nand, ~, &, unsigned short, 2, oldval)
+
+SUBWORD_SYNC_OP (add, , +, unsigned char, 1, oldval)
+SUBWORD_SYNC_OP (sub, , -, unsigned char, 1, oldval)
+SUBWORD_SYNC_OP (or, , |, unsigned char, 1, oldval)
+SUBWORD_SYNC_OP (and, , &, unsigned char, 1, oldval)
+SUBWORD_SYNC_OP (xor, , ^, unsigned char, 1, oldval)
+SUBWORD_SYNC_OP (nand, ~, &, unsigned char, 1, oldval)
+
+#define OP_AND_FETCH_WORD(OP, PFX_OP, INF_OP) \
+ int HIDDEN \
+ __sync_##OP##_and_fetch_4 (int *ptr, int val) \
+ { \
+ int tmp, failure; \
+ \
+ do { \
+ tmp = *ptr; \
+ failure = __kernel_cmpxchg (tmp, PFX_OP (tmp INF_OP val), ptr); \
+ } while (failure != 0); \
+ \
+ return PFX_OP (tmp INF_OP val); \
+ }
+
+OP_AND_FETCH_WORD (add, , +)
+OP_AND_FETCH_WORD (sub, , -)
+OP_AND_FETCH_WORD (or, , |)
+OP_AND_FETCH_WORD (and, , &)
+OP_AND_FETCH_WORD (xor, , ^)
+OP_AND_FETCH_WORD (nand, ~, &)
+
+SUBWORD_SYNC_OP (add, , +, unsigned short, 2, newval)
+SUBWORD_SYNC_OP (sub, , -, unsigned short, 2, newval)
+SUBWORD_SYNC_OP (or, , |, unsigned short, 2, newval)
+SUBWORD_SYNC_OP (and, , &, unsigned short, 2, newval)
+SUBWORD_SYNC_OP (xor, , ^, unsigned short, 2, newval)
+SUBWORD_SYNC_OP (nand, ~, &, unsigned short, 2, newval)
+
+SUBWORD_SYNC_OP (add, , +, unsigned char, 1, newval)
+SUBWORD_SYNC_OP (sub, , -, unsigned char, 1, newval)
+SUBWORD_SYNC_OP (or, , |, unsigned char, 1, newval)
+SUBWORD_SYNC_OP (and, , &, unsigned char, 1, newval)
+SUBWORD_SYNC_OP (xor, , ^, unsigned char, 1, newval)
+SUBWORD_SYNC_OP (nand, ~, &, unsigned char, 1, newval)
+
+int HIDDEN
+__sync_val_compare_and_swap_4 (int *ptr, int oldval, int newval)
+{
+ int actual_oldval, fail;
+
+ while (1)
+ {
+ actual_oldval = *ptr;
+
+ if (oldval != actual_oldval)
+ return actual_oldval;
+
+ fail = __kernel_cmpxchg (actual_oldval, newval, ptr);
+
+ if (!fail)
+ return oldval;
+ }
+}
+
+#define SUBWORD_VAL_CAS(TYPE, WIDTH) \
+ TYPE HIDDEN \
+ __sync_val_compare_and_swap_##WIDTH (TYPE *ptr, TYPE oldval, \
+ TYPE newval) \
+ { \
+ int *wordptr = (int *)((unsigned long) ptr & ~3), fail; \
+ unsigned int mask, shift, actual_oldval, actual_newval; \
+ \
+ shift = (((unsigned long) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH; \
+ mask = MASK_##WIDTH << shift; \
+ \
+ while (1) \
+ { \
+ actual_oldval = *wordptr; \
+ \
+ if (((actual_oldval & mask) >> shift) != (unsigned int) oldval) \
+ return (actual_oldval & mask) >> shift; \
+ \
+ actual_newval = (actual_oldval & ~mask) \
+ | (((unsigned int) newval << shift) & mask); \
+ \
+ fail = __kernel_cmpxchg (actual_oldval, actual_newval, \
+ wordptr); \
+ \
+ if (!fail) \
+ return oldval; \
+ } \
+ }
+
+SUBWORD_VAL_CAS (unsigned short, 2)
+SUBWORD_VAL_CAS (unsigned char, 1)
+
+typedef unsigned char bool;
+
+bool HIDDEN
+__sync_bool_compare_and_swap_4 (int *ptr, int oldval, int newval)
+{
+ int failure = __kernel_cmpxchg (oldval, newval, ptr);
+ return (failure == 0);
+}
+
+#define SUBWORD_BOOL_CAS(TYPE, WIDTH) \
+ bool HIDDEN \
+ __sync_bool_compare_and_swap_##WIDTH (TYPE *ptr, TYPE oldval, \
+ TYPE newval) \
+ { \
+ TYPE actual_oldval \
+ = __sync_val_compare_and_swap_##WIDTH (ptr, oldval, newval); \
+ return (oldval == actual_oldval); \
+ }
+
+SUBWORD_BOOL_CAS (unsigned short, 2)
+SUBWORD_BOOL_CAS (unsigned char, 1)
+
+int HIDDEN
+__sync_lock_test_and_set_4 (int *ptr, int val)
+{
+ int failure, oldval;
+
+ do {
+ oldval = *ptr;
+ failure = __kernel_cmpxchg (oldval, val, ptr);
+ } while (failure != 0);
+
+ return oldval;
+}
+
+#define SUBWORD_TEST_AND_SET(TYPE, WIDTH) \
+ TYPE HIDDEN \
+ __sync_lock_test_and_set_##WIDTH (TYPE *ptr, TYPE val) \
+ { \
+ int failure; \
+ unsigned int oldval, newval, shift, mask; \
+ int *wordptr = (int *) ((unsigned long) ptr & ~3); \
+ \
+ shift = (((unsigned long) ptr & 3) << 3) ^ INVERT_MASK_##WIDTH; \
+ mask = MASK_##WIDTH << shift; \
+ \
+ do { \
+ oldval = *wordptr; \
+ newval = (oldval & ~mask) \
+ | (((unsigned int) val << shift) & mask); \
+ failure = __kernel_cmpxchg (oldval, newval, wordptr); \
+ } while (failure != 0); \
+ \
+ return (oldval & mask) >> shift; \
+ }
+
+SUBWORD_TEST_AND_SET (unsigned short, 2)
+SUBWORD_TEST_AND_SET (unsigned char, 1)
+
+#define SYNC_LOCK_RELEASE(TYPE, WIDTH) \
+ void HIDDEN \
+ __sync_lock_release_##WIDTH (TYPE *ptr) \
+ { \
+ /* All writes before this point must be seen before we release \
+ the lock itself. */ \
+ __kernel_dmb (); \
+ *ptr = 0; \
+ }
+
+SYNC_LOCK_RELEASE (int, 4)
+SYNC_LOCK_RELEASE (short, 2)
+SYNC_LOCK_RELEASE (char, 1)
diff --git a/libgcc/config/nvptx/t-nvptx b/libgcc/config/nvptx/t-nvptx
index c4d20c94cbb..ede0bf0f87d 100644
--- a/libgcc/config/nvptx/t-nvptx
+++ b/libgcc/config/nvptx/t-nvptx
@@ -1,5 +1,6 @@
LIB2ADD=$(srcdir)/config/nvptx/reduction.c \
- $(srcdir)/config/nvptx/mgomp.c
+ $(srcdir)/config/nvptx/mgomp.c \
+ $(srcdir)/config/nvptx/atomic.c
LIB2ADDEH=
LIB2FUNCS_EXCLUDE=__main
--
2.17.1
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [patch] nvptx libgcc atomic routines
2018-09-26 18:41 [patch] nvptx libgcc atomic routines Cesar Philippidis
@ 2018-10-05 13:14 ` Tom de Vries
0 siblings, 0 replies; 2+ messages in thread
From: Tom de Vries @ 2018-10-05 13:14 UTC (permalink / raw)
To: Cesar Philippidis, gcc-patches, Thomas Schwinge
On 9/26/18 8:33 PM, Cesar Philippidis wrote:
> This patch adds nvptx support for the atomic FETCH_AND_OP functions. I
> recall that this used to be important for OpenACC reductions back in the
> GCC 5.0 days before Nathan split reductions into four phases. Nowadays,
> atomic reductions use a spin lock that's implemented directly by the
> nvptx BE. Therefore, I'm not sure if the nvptx port still needs support
> for atomic fetch_and_*.
>
> Tom and Thomas, do either of you have any thoughts on this? Should I
> commit it to trunk?
I'd say no. I can think of only one possible use for this, which is to
be able use -fno-inline-atomics to workaround problems in atomics in
ptx, and I think that that's not sufficiently valuable to start
maintaining these routines in trunk.
Thanks,
- Tom
> I bootstrapped and regtested it for x86_64 Linux
> with nvptx offloading.
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2018-10-05 12:39 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-26 18:41 [patch] nvptx libgcc atomic routines Cesar Philippidis
2018-10-05 13:14 ` Tom de Vries
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).