public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out
@ 2022-11-15 13:03 Jinyang He
  2022-11-15 14:21 ` Xi Ruoyao
  0 siblings, 1 reply; 9+ messages in thread
From: Jinyang He @ 2022-11-15 13:03 UTC (permalink / raw)
  To: Chenghua Xu, Lulu Cheng; +Cc: Weining Lu, Xing Li, yala, Peng Fan, gcc-patches

gcc/ChangeLog:

* config/loongarch/sync.md:
Add atomic_cas_value_exchange_and_7<mode> and fix atomic_exchange<mode>.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/sync-1.c: New test.
---
 gcc/config/loongarch/sync.md                |  27 ++++-
 gcc/testsuite/gcc.target/loongarch/sync-1.c | 104 ++++++++++++++++++++
 2 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/sync-1.c

diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md
index 0c4f1983e..8a8e6247b 100644
--- a/gcc/config/loongarch/sync.md
+++ b/gcc/config/loongarch/sync.md
@@ -448,6 +448,29 @@
 }
   [(set (attr "length") (const_int 32))])
 
+(define_insn "atomic_cas_value_exchange_and_7_<mode>"
+  [(set (match_operand:GPR 0 "register_operand" "=&r")
+	(match_operand:GPR 1 "memory_operand" "+ZC"))
+   (set (match_dup 1)
+	(unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ")
+			      (match_operand:GPR 3 "reg_or_0_operand" "rJ")
+			      (match_operand:GPR 4 "reg_or_0_operand" "rJ")
+			      (match_operand:GPR 5 "reg_or_0_operand"  "rJ")
+			      (match_operand:SI 6 "const_int_operand")] ;; model
+	 UNSPEC_SYNC_EXCHANGE))
+   (clobber (match_scratch:GPR 7 "=&r"))]
+  ""
+{
+  return "%G6\\n\\t"
+	 "1:\\n\\t"
+	 "ll.<amo>\\t%0,%1\\n\\t"
+	 "and\\t%7,%0,%z3\\n\\t"
+	 "or%i5\\t%7,%7,%5\\n\\t"
+	 "sc.<amo>\\t%7,%1\\n\\t"
+	 "beqz\\t%7,1b\\n\\t";
+}
+  [(set (attr "length") (const_int 20))])
+
 (define_expand "atomic_exchange<mode>"
   [(set (match_operand:SHORT 0 "register_operand")
 	(unspec_volatile:SHORT
@@ -459,9 +482,9 @@
   ""
 {
   union loongarch_gen_fn_ptrs generator;
-  generator.fn_7 = gen_atomic_cas_value_cmp_and_7_si;
+  generator.fn_7 = gen_atomic_cas_value_exchange_and_7_si;
   loongarch_expand_atomic_qihi (generator, operands[0], operands[1],
-				operands[1], operands[2], operands[3]);
+				const0_rtx, operands[2], operands[3]);
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/loongarch/sync-1.c b/gcc/testsuite/gcc.target/loongarch/sync-1.c
new file mode 100644
index 000000000..cebed6a9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/sync-1.c
@@ -0,0 +1,104 @@
+/* Test __sync_test_and_set in atomic_exchange */
+/* { dg-do run } */
+/* { dg-options "-lpthread -std=c11" } */
+
+#include <pthread.h>
+#include <sched.h>
+#include <stdatomic.h>
+#include <stdlib.h>
+
+#define NR_THREAD 16
+#define NR_DATA 1000
+#define ITER_COUNT 10000
+
+static int _data[NR_DATA];
+static char _lock;
+static int _overcnt;
+
+static inline void proc_yield(int cnt)
+{
+  __asm__ __volatile__("":::"memory");
+}
+
+static void unlock()
+{
+  return atomic_store_explicit(&_lock, 0, memory_order_seq_cst);
+}
+
+static int trylock()
+{
+  return atomic_exchange_explicit(&_lock, 1, memory_order_acquire) == 0;
+}
+
+static void lockslow()
+{
+  for (int i = 0;; i++) {
+    if (i < 10)
+      proc_yield(i);
+    else
+      sched_yield();
+    if (atomic_load_explicit(&_lock, memory_order_relaxed) == 0
+      && atomic_exchange_explicit(&_lock, 1, memory_order_acquire) == 0)
+      return;
+  }
+}
+
+static void lock()
+{
+  if (trylock())
+    return;
+  lockslow();
+}
+
+static void checkeq(int a, int b)
+{
+  if (a != b)
+    __builtin_abort();
+}
+
+static void adddata()
+{
+  int i, v;
+  lock();
+  v = _data[0];
+  for (i = 0; i < NR_DATA; i++) {
+    checkeq(_data[i], v);
+    _data[i]++;
+  }
+  unlock();
+}
+
+static void backoff()
+{
+  int i, data[NR_DATA] = {0};
+  for (i = 0; i < NR_DATA; i++) {
+    data[i]++;
+    checkeq(data[i], 1);
+  }
+}
+
+static void *write_mutex_thread(void *unused)
+{
+  int i;
+  for (i = 0; i < ITER_COUNT; i++) {
+    adddata();
+    backoff();
+  }
+  atomic_fetch_add(&_overcnt, 1);
+}
+
+int main()
+{
+  int cnt;
+
+  pthread_t threads[NR_THREAD];
+  for (int i = 0; i < NR_THREAD; i++)
+    pthread_create(&threads[i], 0, write_mutex_thread, NULL);
+  for (int i = 0; i < NR_THREAD; i++)
+    pthread_detach(threads[i]);
+  while(cnt != NR_THREAD) {
+    sched_yield();
+    cnt = atomic_load(&_overcnt);
+  }
+  return 0;
+}
-- 
2.34.3


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out
  2022-11-15 13:03 [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out Jinyang He
@ 2022-11-15 14:21 ` Xi Ruoyao
  2022-11-16  2:11   ` Jinyang He
  0 siblings, 1 reply; 9+ messages in thread
From: Xi Ruoyao @ 2022-11-15 14:21 UTC (permalink / raw)
  To: Jinyang He, Chenghua Xu, Lulu Cheng
  Cc: Weining Lu, Xing Li, yala, Peng Fan, gcc-patches

On Tue, 2022-11-15 at 21:03 +0800, Jinyang He wrote:
> gcc/ChangeLog:
> 
> * config/loongarch/sync.md:
> Add atomic_cas_value_exchange_and_7<mode> and fix atomic_exchange<mode>.

nit:

	* config/loongarch/sync.md (atomic_cas_value_exchange_and_7): 
	New define_insn.
	(atomic_exchange): Use atomic_cas_value_exchange_and_7 instead 
	of atomic_cas_value_cmp_and.

> gcc/testsuite/ChangeLog:
>
> * gcc.target/loongarch/sync-1.c: New test.

Likewise, ChangeLog content should be indented with a tab. (Not 8
spaces: if my mail client changes my tab to 8 spaces I'm sorry).

/* snip */

> +  return "%G6\\n\\t"
> +        "1:\\n\\t"
> +        "ll.<amo>\\t%0,%1\\n\\t"
> +        "and\\t%7,%0,%z3\\n\\t"
> +        "or%i5\\t%7,%7,%5\\n\\t"
> +        "sc.<amo>\\t%7,%1\\n\\t"
> +        "beqz\\t%7,1b\\n\\t";

Do we need a "dbar 0x700" after beqz?

/* snip */

> diff --git a/gcc/testsuite/gcc.target/loongarch/sync-1.c b/gcc/testsuite/gcc.target/loongarch/sync-1.c
> new file mode 100644
> index 000000000..cebed6a9b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/sync-1.c
> @@ -0,0 +1,104 @@
> +/* Test __sync_test_and_set in atomic_exchange */
> +/* { dg-do run } */
> +/* { dg-options "-lpthread -std=c11" } */

This test seems not deterministic.  And the use of sched_yield is very
tricky, as the man page says:

       sched_yield() is intended for use with  real-time  scheduling  policies
       (i.e., SCHED_FIFO or SCHED_RR).  Use of sched_yield() with nondetermin‐
       istic scheduling policies such as SCHED_OTHER is unspecified  and  very
       likely means your application design is broken.

I'd suggest to create a bug report at https://gcc.gnu.org/bugzilla and
post this test in the PR.  Then add the PR number into the changelog,
and just add a { dg-do compile } and { dg-final { scan-assembler ... } }
test into the testsuite to ensure the correct ll/sc loop is generated.

A bug report also emphasises that this is a bug fix, which is suitable
for GCC 13 (in stage 3 now) and GCC 12 (the fix will be backported).

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out
  2022-11-15 14:21 ` Xi Ruoyao
@ 2022-11-16  2:11   ` Jinyang He
  2022-11-16 11:46     ` Xi Ruoyao
  0 siblings, 1 reply; 9+ messages in thread
From: Jinyang He @ 2022-11-16  2:11 UTC (permalink / raw)
  To: Xi Ruoyao, Chenghua Xu, Lulu Cheng
  Cc: Weining Lu, Xing Li, yala, Peng Fan, gcc-patches

On 2022/11/15 下午10:21, Xi Ruoyao wrote:

> On Tue, 2022-11-15 at 21:03 +0800, Jinyang He wrote:
>> gcc/ChangeLog:
>>
>> * config/loongarch/sync.md:
>> Add atomic_cas_value_exchange_and_7<mode> and fix atomic_exchange<mode>.
> nit:
>
> 	* config/loongarch/sync.md (atomic_cas_value_exchange_and_7):
> 	New define_insn.
> 	(atomic_exchange): Use atomic_cas_value_exchange_and_7 instead
> 	of atomic_cas_value_cmp_and.
>
>> gcc/testsuite/ChangeLog:
>>
>> * gcc.target/loongarch/sync-1.c: New test.
> Likewise, ChangeLog content should be indented with a tab. (Not 8
> spaces: if my mail client changes my tab to 8 spaces I'm sorry).
>
> /* snip */

OK. Thanks for the clear commit message and the explanation of format.


>> +  return "%G6\\n\\t"
>> +        "1:\\n\\t"
>> +        "ll.<amo>\\t%0,%1\\n\\t"
>> +        "and\\t%7,%0,%z3\\n\\t"
>> +        "or%i5\\t%7,%7,%5\\n\\t"
>> +        "sc.<amo>\\t%7,%1\\n\\t"
>> +        "beqz\\t%7,1b\\n\\t";
> Do we need a "dbar 0x700" after beqz?
>
> /* snip */

That's worth discussing. Actually I don't see any dbar hint definition
like 0x700 in the manual right now.
Besides, I think what should be provided here is a relaxed version. And
whether the barrier exsit or not is depend on the specific memory_order.

https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_dbar


>
>> diff --git a/gcc/testsuite/gcc.target/loongarch/sync-1.c b/gcc/testsuite/gcc.target/loongarch/sync-1.c
>> new file mode 100644
>> index 000000000..cebed6a9b
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/loongarch/sync-1.c
>> @@ -0,0 +1,104 @@
>> +/* Test __sync_test_and_set in atomic_exchange */
>> +/* { dg-do run } */
>> +/* { dg-options "-lpthread -std=c11" } */
> This test seems not deterministic.  And the use of sched_yield is very
> tricky, as the man page says:
>
>         sched_yield() is intended for use with  real-time  scheduling  policies
>         (i.e., SCHED_FIFO or SCHED_RR).  Use of sched_yield() with nondetermin‐
>         istic scheduling policies such as SCHED_OTHER is unspecified  and  very
>         likely means your application design is broken.

Yes, there might be something wrong. The test is just a variants from
llvm::tsan. It was presented to prove that the old implementation did
have problems.


>
> I'd suggest to create a bug report at https://gcc.gnu.org/bugzilla

Thanks, I need to do that. It is must be I missing something at
https://gcc.gnu.org/contribute.html.


> and
> post this test in the PR.  Then add the PR number into the changelog,
> and just add a { dg-do compile } and { dg-final { scan-assembler ... } }
> test into the testsuite to ensure the correct ll/sc loop is generated.
>
> A bug report also emphasises that this is a bug fix, which is suitable
> for GCC 13 (in stage 3 now) and GCC 12 (the fix will be backported).
>
I will create a bug report where we all can discuss it.
Thanks for your review and help. :-)


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out
  2022-11-16  2:11   ` Jinyang He
@ 2022-11-16 11:46     ` Xi Ruoyao
  2022-11-17  1:39       ` Jinyang He
  0 siblings, 1 reply; 9+ messages in thread
From: Xi Ruoyao @ 2022-11-16 11:46 UTC (permalink / raw)
  To: Jinyang He, Chenghua Xu, Lulu Cheng
  Cc: Weining Lu, Xing Li, yala, Peng Fan, gcc-patches

On Wed, 2022-11-16 at 10:11 +0800, Jinyang He wrote:

> > > +  return "%G6\\n\\t"
> > > +        "1:\\n\\t"
> > > +        "ll.<amo>\\t%0,%1\\n\\t"
> > > +        "and\\t%7,%0,%z3\\n\\t"
> > > +        "or%i5\\t%7,%7,%5\\n\\t"
> > > +        "sc.<amo>\\t%7,%1\\n\\t"
> > > +        "beqz\\t%7,1b\\n\\t";
> > Do we need a "dbar 0x700" after beqz?
> > 
> > /* snip */
> 
> That's worth discussing. Actually I don't see any dbar hint definition
> like 0x700 in the manual right now.
> Besides, I think what should be provided here is a relaxed version. And
> whether the barrier exsit or not is depend on the specific memory_order.

It's not related to memory order, but for a hardware issue workaround. 
Jiaxun told me (via LKML):

   I had checked with Loongson guys and they confirmed that the
   workaround still needs to be applied to latest 3A4000 processors,
   including 3A4000 for MIPS and 3A5000 for LoongArch.
   
   Though, the reason behind the workaround varies with the evaluation
   of their uArch, for GS464V based core, barrier is required as the
   uArch design allows regular load to be reordered after an atomic
   linked load, and that would break assumption of compiler atomic
   constraints.

Without these dbar instructions I'd got random test failures in GCC
libgomp test suite.

We use a non-zero hint here because it is treated exactly same as zero
in 3A5000, and the future LoongArch processors can fix the issue and
ignore the dbar 0x700 instruction.


-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out
  2022-11-16 11:46     ` Xi Ruoyao
@ 2022-11-17  1:39       ` Jinyang He
  2022-11-17  2:55         ` Jinyang He
  0 siblings, 1 reply; 9+ messages in thread
From: Jinyang He @ 2022-11-17  1:39 UTC (permalink / raw)
  To: Xi Ruoyao, Chenghua Xu, Lulu Cheng
  Cc: Weining Lu, Xing Li, yala, Peng Fan, gcc-patches, Huang Pei

On 2022/11/16 下午7:46, Xi Ruoyao wrote:

> On Wed, 2022-11-16 at 10:11 +0800, Jinyang He wrote:
>
>>>> +  return "%G6\\n\\t"
>>>> +        "1:\\n\\t"
>>>> +        "ll.<amo>\\t%0,%1\\n\\t"
>>>> +        "and\\t%7,%0,%z3\\n\\t"
>>>> +        "or%i5\\t%7,%7,%5\\n\\t"
>>>> +        "sc.<amo>\\t%7,%1\\n\\t"
>>>> +        "beqz\\t%7,1b\\n\\t";
>>> Do we need a "dbar 0x700" after beqz?
>>>
>>> /* snip */
>> That's worth discussing. Actually I don't see any dbar hint definition
>> like 0x700 in the manual right now.
>> Besides, I think what should be provided here is a relaxed version. And
>> whether the barrier exsit or not is depend on the specific memory_order.
> It's not related to memory order, but for a hardware issue workaround.
> Jiaxun told me (via LKML):
>
>     I had checked with Loongson guys and they confirmed that the
>     workaround still needs to be applied to latest 3A4000 processors,
>     including 3A4000 for MIPS and 3A5000 for LoongArch.
>     
>     Though, the reason behind the workaround varies with the evaluation
>     of their uArch, for GS464V based core, barrier is required as the
>     uArch design allows regular load to be reordered after an atomic
>     linked load, and that would break assumption of compiler atomic
>     constraints.

That certainly seems to be needed, but before or after. It's beyond my
recognition and cc huangpei@loongson.cn for help.


>
> Without these dbar instructions I'd got random test failures in GCC
> libgomp test suite.
>
> We use a non-zero hint here because it is treated exactly same as zero
> in 3A5000, and the future LoongArch processors can fix the issue and
> ignore the dbar 0x700 instruction.
Thanks, it's a nice workaround.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out
  2022-11-17  1:39       ` Jinyang He
@ 2022-11-17  2:55         ` Jinyang He
  2022-11-17  3:38           ` Xi Ruoyao
  0 siblings, 1 reply; 9+ messages in thread
From: Jinyang He @ 2022-11-17  2:55 UTC (permalink / raw)
  To: Xi Ruoyao, Chenghua Xu, Lulu Cheng
  Cc: Weining Lu, Xing Li, yala, Peng Fan, gcc-patches, Huang Pei

On 2022/11/17 上午9:39, Jinyang He wrote:

> On 2022/11/16 下午7:46, Xi Ruoyao wrote:
>
>> On Wed, 2022-11-16 at 10:11 +0800, Jinyang He wrote:
>>
>>>>> +  return "%G6\\n\\t"
>>>>> +        "1:\\n\\t"
>>>>> +        "ll.<amo>\\t%0,%1\\n\\t"
>>>>> +        "and\\t%7,%0,%z3\\n\\t"
>>>>> +        "or%i5\\t%7,%7,%5\\n\\t"
>>>>> +        "sc.<amo>\\t%7,%1\\n\\t"
>>>>> +        "beqz\\t%7,1b\\n\\t";
>>>> Do we need a "dbar 0x700" after beqz?
>>>>
>>>> /* snip */
>>> That's worth discussing. Actually I don't see any dbar hint definition
>>> like 0x700 in the manual right now.
>>> Besides, I think what should be provided here is a relaxed version. And
>>> whether the barrier exsit or not is depend on the specific 
>>> memory_order.
>> It's not related to memory order, but for a hardware issue workaround.
>> Jiaxun told me (via LKML):
>>
>>     I had checked with Loongson guys and they confirmed that the
>>     workaround still needs to be applied to latest 3A4000 processors,
>>     including 3A4000 for MIPS and 3A5000 for LoongArch.
>>         Though, the reason behind the workaround varies with the 
>> evaluation
>>     of their uArch, for GS464V based core, barrier is required as the
>>     uArch design allows regular load to be reordered after an atomic
>>     linked load, and that would break assumption of compiler atomic
>>     constraints.
>
> That certainly seems to be needed, but before or after. It's beyond my
> recognition and cc huangpei@loongson.cn for help.


Pei told me the ll-sc works at present like follows,

uArch like:
   ll -> (ll.dbar ll.ld_atomic)
   sc -> (sc.dbar sc.st_atomic)

exchange:
ll.dbar
<---------------------------+
ll.ld_atomic $rd            |
...(no jmp)                 |
sc.dbar                     |
sc.st_stomic $rd            |
ld $rj -can-not-emit-at-----+

The load $rj can not emit between ll.dbar and ll.ld_atomic because the 
sc.dbar barrier it.


compare and exchange:
ll.dbar
<-----------------------+
ll.ld_atomic $rd        |
...(jmp) ---------------+------+
sc.dbar                 |      |
sc.st_stomic $rd        |      |
                         |   <--+
ld $rj -may-emit-at-----+

Jumping out ll-sc may lead loading $rj emit between ll.dbar and ll.atomic.


Thus, exchange not need dbar.


>
>
>>
>> Without these dbar instructions I'd got random test failures in GCC
>> libgomp test suite.

Which test suite?


>>
>> We use a non-zero hint here because it is treated exactly same as zero
>> in 3A5000, and the future LoongArch processors can fix the issue and
>> ignore the dbar 0x700 instruction.
> Thanks, it's a nice workaround.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out
  2022-11-17  2:55         ` Jinyang He
@ 2022-11-17  3:38           ` Xi Ruoyao
  2022-11-17  3:46             ` Jinyang He
  0 siblings, 1 reply; 9+ messages in thread
From: Xi Ruoyao @ 2022-11-17  3:38 UTC (permalink / raw)
  To: Jinyang He, Chenghua Xu, Lulu Cheng
  Cc: Weining Lu, Xing Li, yala, Peng Fan, gcc-patches, Huang Pei

On Thu, 2022-11-17 at 10:55 +0800, Jinyang He wrote:
> On 2022/11/17 上午9:39, Jinyang He wrote:
> 
> > On 2022/11/16 下午7:46, Xi Ruoyao wrote:
> > 
> > > On Wed, 2022-11-16 at 10:11 +0800, Jinyang He wrote:
> > > 
> > > > > > +  return "%G6\\n\\t"
> > > > > > +        "1:\\n\\t"
> > > > > > +        "ll.<amo>\\t%0,%1\\n\\t"
> > > > > > +        "and\\t%7,%0,%z3\\n\\t"
> > > > > > +        "or%i5\\t%7,%7,%5\\n\\t"
> > > > > > +        "sc.<amo>\\t%7,%1\\n\\t"
> > > > > > +        "beqz\\t%7,1b\\n\\t";
> > > > > Do we need a "dbar 0x700" after beqz?
> > > > > 
> > > > > /* snip */
> > > > That's worth discussing. Actually I don't see any dbar hint definition
> > > > like 0x700 in the manual right now.
> > > > Besides, I think what should be provided here is a relaxed version. And
> > > > whether the barrier exsit or not is depend on the specific 
> > > > memory_order.
> > > It's not related to memory order, but for a hardware issue workaround.
> > > Jiaxun told me (via LKML):
> > > 
> > >     I had checked with Loongson guys and they confirmed that the
> > >     workaround still needs to be applied to latest 3A4000 processors,
> > >     including 3A4000 for MIPS and 3A5000 for LoongArch.
> > >         Though, the reason behind the workaround varies with the 
> > > evaluation
> > >     of their uArch, for GS464V based core, barrier is required as the
> > >     uArch design allows regular load to be reordered after an atomic
> > >     linked load, and that would break assumption of compiler atomic
> > >     constraints.
> > 
> > That certainly seems to be needed, but before or after. It's beyond my
> > recognition and cc huangpei@loongson.cn for help.
> 
> 
> Pei told me the ll-sc works at present like follows,
> 
> uArch like:
>    ll -> (ll.dbar ll.ld_atomic)
>    sc -> (sc.dbar sc.st_atomic)
> 
> exchange:
> ll.dbar
> <---------------------------+
> ll.ld_atomic $rd            |
> ...(no jmp)                 |
> sc.dbar                     |
> sc.st_stomic $rd            |
> ld $rj -can-not-emit-at-----+
> 
> The load $rj can not emit between ll.dbar and ll.ld_atomic because the
> sc.dbar barrier it.
> 
> 
> compare and exchange:
> ll.dbar
> <-----------------------+
> ll.ld_atomic $rd        |
> ...(jmp) ---------------+------+
> sc.dbar                 |      |
> sc.st_stomic $rd        |      |
>                          |   <--+
> ld $rj -may-emit-at-----+
> 
> Jumping out ll-sc may lead loading $rj emit between ll.dbar and ll.atomic.
> 
> 
> Thus, exchange not need dbar.
> 
> 
> > 
> > 
> > > 
> > > Without these dbar instructions I'd got random test failures in GCC
> > > libgomp test suite.
> 
> Which test suite?

I mean when we didn't use dbar 0x700 for compare-and-exchange (during
the early development stage of GCC for LoongArch) I observed these
failures.

So we do need an additional dbar for compare-and-exchange, but do not
need it for a bare atomic exchange?

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out
  2022-11-17  3:38           ` Xi Ruoyao
@ 2022-11-17  3:46             ` Jinyang He
  2022-11-17  5:56               ` Xi Ruoyao
  0 siblings, 1 reply; 9+ messages in thread
From: Jinyang He @ 2022-11-17  3:46 UTC (permalink / raw)
  To: Xi Ruoyao, Chenghua Xu, Lulu Cheng
  Cc: Weining Lu, Xing Li, yala, Peng Fan, gcc-patches, Huang Pei

On 2022/11/17 上午11:38, Xi Ruoyao wrote:

> On Thu, 2022-11-17 at 10:55 +0800, Jinyang He wrote:
>> On 2022/11/17 上午9:39, Jinyang He wrote:
>>
>>> On 2022/11/16 下午7:46, Xi Ruoyao wrote:
>>>
>>>> On Wed, 2022-11-16 at 10:11 +0800, Jinyang He wrote:
>>>>
>>>>>>> +  return "%G6\\n\\t"
>>>>>>> +        "1:\\n\\t"
>>>>>>> +        "ll.<amo>\\t%0,%1\\n\\t"
>>>>>>> +        "and\\t%7,%0,%z3\\n\\t"
>>>>>>> +        "or%i5\\t%7,%7,%5\\n\\t"
>>>>>>> +        "sc.<amo>\\t%7,%1\\n\\t"
>>>>>>> +        "beqz\\t%7,1b\\n\\t";
>>>>>> Do we need a "dbar 0x700" after beqz?
>>>>>>
>>>>>> /* snip */
>>>>> That's worth discussing. Actually I don't see any dbar hint definition
>>>>> like 0x700 in the manual right now.
>>>>> Besides, I think what should be provided here is a relaxed version. And
>>>>> whether the barrier exsit or not is depend on the specific
>>>>> memory_order.
>>>> It's not related to memory order, but for a hardware issue workaround.
>>>> Jiaxun told me (via LKML):
>>>>
>>>>      I had checked with Loongson guys and they confirmed that the
>>>>      workaround still needs to be applied to latest 3A4000 processors,
>>>>      including 3A4000 for MIPS and 3A5000 for LoongArch.
>>>>          Though, the reason behind the workaround varies with the
>>>> evaluation
>>>>      of their uArch, for GS464V based core, barrier is required as the
>>>>      uArch design allows regular load to be reordered after an atomic
>>>>      linked load, and that would break assumption of compiler atomic
>>>>      constraints.
>>> That certainly seems to be needed, but before or after. It's beyond my
>>> recognition and cc huangpei@loongson.cn for help.
>>
>> Pei told me the ll-sc works at present like follows,
>>
>> uArch like:
>>     ll -> (ll.dbar ll.ld_atomic)
>>     sc -> (sc.dbar sc.st_atomic)
>>
>> exchange:
>> ll.dbar
>> <---------------------------+
>> ll.ld_atomic $rd            |
>> ...(no jmp)                 |
>> sc.dbar                     |
>> sc.st_stomic $rd            |
>> ld $rj -can-not-emit-at-----+
>>
>> The load $rj can not emit between ll.dbar and ll.ld_atomic because the
>> sc.dbar barrier it.
>>
>>
>> compare and exchange:
>> ll.dbar
>> <-----------------------+
>> ll.ld_atomic $rd        |
>> ...(jmp) ---------------+------+
>> sc.dbar                 |      |
>> sc.st_stomic $rd        |      |
>>                           |   <--+
>> ld $rj -may-emit-at-----+
>>
>> Jumping out ll-sc may lead loading $rj emit between ll.dbar and ll.atomic.
>>
>>
>> Thus, exchange not need dbar.
>>
>>
>>>
>>>> Without these dbar instructions I'd got random test failures in GCC
>>>> libgomp test suite.
>> Which test suite?
> I mean when we didn't use dbar 0x700 for compare-and-exchange (during
> the early development stage of GCC for LoongArch) I observed these
> failures.
>
> So we do need an additional dbar for compare-and-exchange, but do not
> need it for a bare atomic exchange?
Yes.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out
  2022-11-17  3:46             ` Jinyang He
@ 2022-11-17  5:56               ` Xi Ruoyao
  0 siblings, 0 replies; 9+ messages in thread
From: Xi Ruoyao @ 2022-11-17  5:56 UTC (permalink / raw)
  To: Jinyang He, Chenghua Xu, Lulu Cheng
  Cc: Weining Lu, Xing Li, yala, Peng Fan, gcc-patches, Huang Pei

[-- Attachment #1: Type: text/plain, Size: 597 bytes --]

On Thu, 2022-11-17 at 11:46 +0800, Jinyang He wrote:
> > So we do need an additional dbar for compare-and-exchange, but do
> > not
> > need it for a bare atomic exchange?
> Yes.

Ok, I just noticed we also don't use dbar in atomic_add etc.

I've adjusted the patch a little (in attachment): rewritten the
ChangeLog and test, and renamed "value_exchange_and" to just
"value_exchange" because we are not doing anything beyond the exchange.

If it looks OK you can just send it as v2.

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

[-- Attachment #2: pr107713.patch --]
[-- Type: text/x-patch, Size: 4552 bytes --]

From 8b64115dfb289777300c4e84e278ab3d8ec4fe86 Mon Sep 17 00:00:00 2001
From: Jinyang He <hejinyang@loongson.cn>
Date: Tue, 15 Nov 2022 21:03:28 +0800
Subject: [PATCH v2] LoongArch: Fix atomic_exchange expanding [PR107713]

We used to expand atomic_exchange_n(ptr, new, mem_order) for subword types
into something like:

    {
      __typeof__(*ptr) t = atomic_load_n(ptr, mem_order);
      atomic_compare_exchange_n(ptr, &t, new, true, mem_order, mem_order);
      return t;
    }

It's incorrect because another thread may store a different value into *ptr
after atomic_load_n.  Then atomic_compare_exchange_n will not store into
*ptr, but atomic_exchange_n should always perform the store.

gcc/ChangeLog:

	PR target/107713
	* config/loongarch/sync.md
	(atomic_cas_value_exchange_7_<mode>): New define_insn.
	(atomic_exchange): Use atomic_cas_value_exchange_7_si instead of
	atomic_cas_value_cmp_and_7_si.

gcc/testsuite/ChangeLog:

	PR target/107713
	* gcc.target/loongarch/pr107713-1.c: New test.
	* gcc.target/loongarch/pr107713-2.c: New test.
---
 gcc/config/loongarch/sync.md                  | 27 +++++++++-
 .../gcc.target/loongarch/pr107713-1.c         | 50 +++++++++++++++++++
 .../gcc.target/loongarch/pr107713-2.c         |  9 ++++
 3 files changed, 84 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/pr107713-1.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/pr107713-2.c

diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md
index 0c4f1983e88..45be1442439 100644
--- a/gcc/config/loongarch/sync.md
+++ b/gcc/config/loongarch/sync.md
@@ -448,6 +448,29 @@
 }
   [(set (attr "length") (const_int 32))])
 
+(define_insn "atomic_cas_value_exchange_7_<mode>"
+  [(set (match_operand:GPR 0 "register_operand" "=&r")
+	(match_operand:GPR 1 "memory_operand" "+ZC"))
+   (set (match_dup 1)
+	(unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ")
+			      (match_operand:GPR 3 "reg_or_0_operand" "rJ")
+			      (match_operand:GPR 4 "reg_or_0_operand" "rJ")
+			      (match_operand:GPR 5 "reg_or_0_operand"  "rJ")
+			      (match_operand:SI 6 "const_int_operand")] ;; model
+	 UNSPEC_SYNC_EXCHANGE))
+   (clobber (match_scratch:GPR 7 "=&r"))]
+  ""
+{
+  return "%G6\\n\\t"
+	 "1:\\n\\t"
+	 "ll.<amo>\\t%0,%1\\n\\t"
+	 "and\\t%7,%0,%z3\\n\\t"
+	 "or%i5\\t%7,%7,%5\\n\\t"
+	 "sc.<amo>\\t%7,%1\\n\\t"
+	 "beqz\\t%7,1b\\n\\t";
+}
+  [(set (attr "length") (const_int 20))])
+
 (define_expand "atomic_exchange<mode>"
   [(set (match_operand:SHORT 0 "register_operand")
 	(unspec_volatile:SHORT
@@ -459,9 +482,9 @@
   ""
 {
   union loongarch_gen_fn_ptrs generator;
-  generator.fn_7 = gen_atomic_cas_value_cmp_and_7_si;
+  generator.fn_7 = gen_atomic_cas_value_exchange_7_si;
   loongarch_expand_atomic_qihi (generator, operands[0], operands[1],
-				operands[1], operands[2], operands[3]);
+				const0_rtx, operands[2], operands[3]);
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/loongarch/pr107713-1.c b/gcc/testsuite/gcc.target/loongarch/pr107713-1.c
new file mode 100644
index 00000000000..d1536c95b27
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/pr107713-1.c
@@ -0,0 +1,50 @@
+/* { dg-do run } */
+/* { dg-require-effective-target pthread } */
+/* { dg-options "-pthread" } */
+
+#include <pthread.h>
+
+char x, x1, x2;
+
+void *
+work1 (void *)
+{
+  for (int i = 0; i < 100; i++)
+    x1 = __atomic_exchange_n (&x, x1, __ATOMIC_SEQ_CST);
+  return NULL;
+}
+
+void *
+work2 (void *)
+{
+  for (int i = 0; i < 100; i++)
+    x2 = __atomic_exchange_n (&x, x2, __ATOMIC_SEQ_CST);
+  return NULL;
+}
+
+void
+test (void)
+{
+  x = 0;
+  x1 = 1;
+  x2 = 2;
+  pthread_t w1, w2;
+  if (pthread_create (&w1, NULL, work1, NULL) != 0)
+    __builtin_abort ();
+  if (pthread_create (&w2, NULL, work2, NULL) != 0)
+    __builtin_abort ();
+  if (pthread_join (w1, NULL) != 0)
+    __builtin_abort ();
+  if (pthread_join (w2, NULL) != 0)
+    __builtin_abort ();
+  if ((x ^ x1 ^ x2) != 3)
+    __builtin_abort ();
+}
+
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 10000; i++)
+    test ();
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/pr107713-2.c b/gcc/testsuite/gcc.target/loongarch/pr107713-2.c
new file mode 100644
index 00000000000..82d44db3d51
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/pr107713-2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-times "beq|bne" 1 } } */
+
+char
+t (char *p, char x)
+{
+  return __atomic_exchange_n (p, x, __ATOMIC_RELAXED);
+}
-- 
2.37.3


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-11-17  5:56 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-15 13:03 [PATCH] LoongArch: Fix atomic_exchange make comparison and may jump out Jinyang He
2022-11-15 14:21 ` Xi Ruoyao
2022-11-16  2:11   ` Jinyang He
2022-11-16 11:46     ` Xi Ruoyao
2022-11-17  1:39       ` Jinyang He
2022-11-17  2:55         ` Jinyang He
2022-11-17  3:38           ` Xi Ruoyao
2022-11-17  3:46             ` Jinyang He
2022-11-17  5:56               ` Xi Ruoyao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).