public inbox for gcc-patches@gcc.gnu.org
* [PATCH][AArch64] Emit tighter strong atomic compare-exchange loop when comparing against zero
@ 2017-03-08 16:35 Kyrill Tkachov
  2017-04-24  9:54 ` Kyrill Tkachov
  2017-06-02 14:07 ` James Greenhalgh
  0 siblings, 2 replies; 5+ messages in thread
From: Kyrill Tkachov @ 2017-03-08 16:35 UTC (permalink / raw)
  To: GCC Patches; +Cc: Marcus Shawcroft, Richard Earnshaw, James Greenhalgh

[-- Attachment #1: Type: text/plain, Size: 1920 bytes --]

Hi all,

For the testcase in this patch, where the value of x is zero, we currently generate:
foo:
         mov     w1, 4
.L2:
         ldaxr   w2, [x0]
         cmp     w2, 0
         bne     .L3
         stxr    w3, w1, [x0]
         cbnz    w3, .L2
.L3:
         cset    w0, eq
         ret

We currently cannot merge the cmp and b.ne inside the loop into a cbnz because we need
the condition flags set for the return value of the function (i.e. the cset at the end).
But if we re-jig the sequence in that case we can generate a tighter loop:
foo:
         mov     w1, 4
.L2:
         ldaxr   w2, [x0]
         cbnz    w2, .L3
         stxr    w3, w1, [x0]
         cbnz    w3, .L2
.L3:
         cmp     w2, 0
         cset    w0, eq
         ret

So we add an explicit compare after the loop, and inside the loop we use the fact that
we're comparing against zero to emit a CBNZ. This means we may end up doing the comparison
twice (once in the CBNZ, once in the CMP at the end), but there is now less code inside the loop.
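
For reference, the new testcase in the attached patch captures exactly this shape: a strong
__atomic_compare_exchange_n (weak argument 0) whose expected value is zero.

/* Strong variant with an expected value of 0, so the splitter can use
   CBNZ inside the loop and a single CMP afterwards.  */
int
foo (int *a)
{
  int x = 0;
  return __atomic_compare_exchange_n (a, &x, 4, 0,
                                      __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
}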

I've seen this sequence appear in glibc locking code, so maybe it's worth adding the extra bit
of complexity to the compare-exchange splitter to catch this case.
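
As a rough illustration (a hypothetical sketch with made-up names, not actual glibc source), a
try-lock built on a strong compare-exchange against zero is the kind of caller that hits this path:

#include <stdatomic.h>

/* Hypothetical lock type and function name, for illustration only.  */
typedef atomic_int spin_lock_t;

static inline int
spin_try_lock (spin_lock_t *lock)
{
  int expected = 0;   /* the lock is free when it holds zero */
  /* Strong CAS against zero: on AArch64 this should now get the
     CBNZ-based LDAXR/STXR loop from the splitter.  */
  return atomic_compare_exchange_strong_explicit (lock, &expected, 1,
                                                  memory_order_acquire,
                                                  memory_order_relaxed);
}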

Bootstrapped and tested on aarch64-none-linux-gnu. In previous iterations of the patch, where
I had gotten some logic wrong, it caused miscompiles of libgomp leading to timeouts in its
testsuite, but this version passes everything cleanly.

Ok for GCC 8? (I know it's early, but might as well get it out in case someone wants to try it out)

Thanks,
Kyrill

2017-03-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * config/aarch64/aarch64.c (aarch64_split_compare_and_swap):
     Emit CBNZ inside loop when doing a strong exchange and comparing
     against zero.  Generate the CC flags after the loop.

2017-03-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c: New test.

[-- Attachment #2: aarch64-cmp-strong.patch --]
[-- Type: text/x-patch, Size: 2971 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 76a2de20dfcd4ea38fb7c58a9e8612509c5987bd..5fa8e197328ce4cb1718ff7d99b1ea95e02129a4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12095,6 +12095,17 @@ aarch64_split_compare_and_swap (rtx operands[])
   mode = GET_MODE (mem);
   model = memmodel_from_int (INTVAL (model_rtx));
 
+  /* When OLDVAL is zero and we want the strong version we can emit a tighter
+    loop:
+    .label1:
+	LD[A]XR	rval, [mem]
+	CBNZ	rval, .label2
+	ST[L]XR	scratch, newval, [mem]
+	CBNZ	scratch, .label1
+    .label2:
+	CMP	rval, 0.  */
+  bool strong_zero_p = !is_weak && oldval == const0_rtx;
+
   label1 = NULL;
   if (!is_weak)
     {
@@ -12111,11 +12122,21 @@ aarch64_split_compare_and_swap (rtx operands[])
   else
     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
 
-  cond = aarch64_gen_compare_reg (NE, rval, oldval);
-  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
-  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
-  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+  if (strong_zero_p)
+    {
+      x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
+      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+    }
+  else
+    {
+      cond = aarch64_gen_compare_reg (NE, rval, oldval);
+      x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+				 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+    }
 
   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
 
@@ -12134,7 +12155,15 @@ aarch64_split_compare_and_swap (rtx operands[])
     }
 
   emit_label (label2);
-
+  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
+     to set the condition flags.  If this is not used it will be removed by
+     later passes.  */
+  if (strong_zero_p)
+    {
+      cond = gen_rtx_REG (CCmode, CC_REGNUM);
+      x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
+      emit_insn (gen_rtx_SET (cond, x));
+    }
   /* Emit any final barrier needed for a __sync operation.  */
   if (is_mm_sync (model))
     aarch64_emit_post_barrier (model);
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..b14a7c294376f03cd13077d18d865f83a04bd04e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int
+foo (int *a)
+{
+  int x = 0;
+  return __atomic_compare_exchange_n (a, &x, 4, 0,
+				      __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
+}
+
+/* { dg-final { scan-assembler-times "cbnz\\tw\[0-9\]+" 2 } } */


* Re: [PATCH][AArch64] Emit tighter strong atomic compare-exchange loop when comparing against zero
  2017-03-08 16:35 [PATCH][AArch64] Emit tighter strong atomic compare-exchange loop when comparing against zero Kyrill Tkachov
@ 2017-04-24  9:54 ` Kyrill Tkachov
  2017-05-08 11:01   ` Kyrill Tkachov
  2017-06-02 14:07 ` James Greenhalgh
  1 sibling, 1 reply; 5+ messages in thread
From: Kyrill Tkachov @ 2017-04-24  9:54 UTC (permalink / raw)
  To: GCC Patches; +Cc: Marcus Shawcroft, Richard Earnshaw, James Greenhalgh

Pinging this back into context so that I don't forget about it...

https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00376.html

Thanks,
Kyrill

On 08/03/17 16:35, Kyrill Tkachov wrote:
> Hi all,
>
> For the testcase in this patch where the value of x is zero we currently generate:
> foo:
>         mov     w1, 4
> .L2:
>         ldaxr   w2, [x0]
>         cmp     w2, 0
>         bne     .L3
>         stxr    w3, w1, [x0]
>         cbnz    w3, .L2
> .L3:
>         cset    w0, eq
>         ret
>
> We currently cannot merge the cmp and b.ne inside the loop into a cbnz because we need
> the condition flags set for the return value of the function (i.e. the cset at the end).
> But if we re-jig the sequence in that case we can generate a tighter loop:
> foo:
>         mov     w1, 4
> .L2:
>         ldaxr   w2, [x0]
>         cbnz    w2, .L3
>         stxr    w3, w1, [x0]
>         cbnz    w3, .L2
> .L3:
>         cmp     w2, 0
>         cset    w0, eq
>         ret
>
> So we add an explicit compare after the loop, and inside the loop we use the fact that
> we're comparing against zero to emit a CBNZ. This means we may end up doing the comparison
> twice (once in the CBNZ, once in the CMP at the end), but there is now less code inside the loop.
>
> I've seen this sequence appear in glibc locking code so maybe it's worth adding the extra bit
> of complexity to the compare-exchange splitter to catch this case.
>
> Bootstrapped and tested on aarch64-none-linux-gnu. In previous iterations of the patch, where
> I had gotten some logic wrong, it caused miscompiles of libgomp leading to timeouts in its
> testsuite, but this version passes everything cleanly.
>
> Ok for GCC 8? (I know it's early, but might as well get it out in case someone wants to try it out)
>
> Thanks,
> Kyrill
>
> 2017-03-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>     * config/aarch64/aarch64.c (aarch64_split_compare_and_swap):
>     Emit CBNZ inside loop when doing a strong exchange and comparing
>     against zero.  Generate the CC flags after the loop.
>
> 2017-03-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>     * gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c: New test.


* Re: [PATCH][AArch64] Emit tighter strong atomic compare-exchange loop when comparing against zero
  2017-04-24  9:54 ` Kyrill Tkachov
@ 2017-05-08 11:01   ` Kyrill Tkachov
  2017-06-02 10:38     ` Kyrill Tkachov
  0 siblings, 1 reply; 5+ messages in thread
From: Kyrill Tkachov @ 2017-05-08 11:01 UTC (permalink / raw)
  To: GCC Patches; +Cc: Marcus Shawcroft, Richard Earnshaw, James Greenhalgh

Ping.

Thanks,
Kyrill

On 24/04/17 10:38, Kyrill Tkachov wrote:
> Pinging this back into context so that I don't forget about it...
>
> https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00376.html
>
> Thanks,
> Kyrill
>
> On 08/03/17 16:35, Kyrill Tkachov wrote:
>> Hi all,
>>
>> For the testcase in this patch where the value of x is zero we currently generate:
>> foo:
>>         mov     w1, 4
>> .L2:
>>         ldaxr   w2, [x0]
>>         cmp     w2, 0
>>         bne     .L3
>>         stxr    w3, w1, [x0]
>>         cbnz    w3, .L2
>> .L3:
>>         cset    w0, eq
>>         ret
>>
>> We currently cannot merge the cmp and b.ne inside the loop into a cbnz because we need
>> the condition flags set for the return value of the function (i.e. the cset at the end).
>> But if we re-jig the sequence in that case we can generate a tighter loop:
>> foo:
>>         mov     w1, 4
>> .L2:
>>         ldaxr   w2, [x0]
>>         cbnz    w2, .L3
>>         stxr    w3, w1, [x0]
>>         cbnz    w3, .L2
>> .L3:
>>         cmp     w2, 0
>>         cset    w0, eq
>>         ret
>>
>> So we add an explicit compare after the loop, and inside the loop we use the fact that
>> we're comparing against zero to emit a CBNZ. This means we may end up doing the comparison
>> twice (once in the CBNZ, once in the CMP at the end), but there is now less code inside the loop.
>>
>> I've seen this sequence appear in glibc locking code so maybe it's worth adding the extra bit
>> of complexity to the compare-exchange splitter to catch this case.
>>
>> Bootstrapped and tested on aarch64-none-linux-gnu. In previous iterations of the patch, where
>> I had gotten some logic wrong, it caused miscompiles of libgomp leading to timeouts in its
>> testsuite, but this version passes everything cleanly.
>>
>> Ok for GCC 8? (I know it's early, but might as well get it out in case someone wants to try it out)
>>
>> Thanks,
>> Kyrill
>>
>> 2017-03-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>
>>     * config/aarch64/aarch64.c (aarch64_split_compare_and_swap):
>>     Emit CBNZ inside loop when doing a strong exchange and comparing
>>     against zero.  Generate the CC flags after the loop.
>>
>> 2017-03-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>
>>     * gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c: New test.
>


* Re: [PATCH][AArch64] Emit tighter strong atomic compare-exchange loop when comparing against zero
  2017-05-08 11:01   ` Kyrill Tkachov
@ 2017-06-02 10:38     ` Kyrill Tkachov
  0 siblings, 0 replies; 5+ messages in thread
From: Kyrill Tkachov @ 2017-06-02 10:38 UTC (permalink / raw)
  To: GCC Patches; +Cc: Marcus Shawcroft, Richard Earnshaw, James Greenhalgh

Ping.

Thanks,
Kyrill

On 08/05/17 12:00, Kyrill Tkachov wrote:
> Ping.
>
> Thanks,
> Kyrill
>
> On 24/04/17 10:38, Kyrill Tkachov wrote:
>> Pinging this back into context so that I don't forget about it...
>>
>> https://gcc.gnu.org/ml/gcc-patches/2017-03/msg00376.html
>>
>> Thanks,
>> Kyrill
>>
>> On 08/03/17 16:35, Kyrill Tkachov wrote:
>>> Hi all,
>>>
>>> For the testcase in this patch where the value of x is zero we currently generate:
>>> foo:
>>>         mov     w1, 4
>>> .L2:
>>>         ldaxr   w2, [x0]
>>>         cmp     w2, 0
>>>         bne     .L3
>>>         stxr    w3, w1, [x0]
>>>         cbnz    w3, .L2
>>> .L3:
>>>         cset    w0, eq
>>>         ret
>>>
>>> We currently cannot merge the cmp and b.ne inside the loop into a cbnz because we need
>>> the condition flags set for the return value of the function (i.e. the cset at the end).
>>> But if we re-jig the sequence in that case we can generate a tighter loop:
>>> foo:
>>>         mov     w1, 4
>>> .L2:
>>>         ldaxr   w2, [x0]
>>>         cbnz    w2, .L3
>>>         stxr    w3, w1, [x0]
>>>         cbnz    w3, .L2
>>> .L3:
>>>         cmp     w2, 0
>>>         cset    w0, eq
>>>         ret
>>>
>>> So we add an explicit compare after the loop, and inside the loop we use the fact that
>>> we're comparing against zero to emit a CBNZ. This means we may end up doing the comparison
>>> twice (once in the CBNZ, once in the CMP at the end), but there is now less code inside the loop.
>>>
>>> I've seen this sequence appear in glibc locking code so maybe it's worth adding the extra bit
>>> of complexity to the compare-exchange splitter to catch this case.
>>>
>>> Bootstrapped and tested on aarch64-none-linux-gnu. In previous iterations of the patch, where
>>> I had gotten some logic wrong, it caused miscompiles of libgomp leading to timeouts in its
>>> testsuite, but this version passes everything cleanly.
>>>
>>> Ok for GCC 8? (I know it's early, but might as well get it out in case someone wants to try it out)
>>>
>>> Thanks,
>>> Kyrill
>>>
>>> 2017-03-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>>
>>>     * config/aarch64/aarch64.c (aarch64_split_compare_and_swap):
>>>     Emit CBNZ inside loop when doing a strong exchange and comparing
>>>     against zero.  Generate the CC flags after the loop.
>>>
>>> 2017-03-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>>>
>>>     * gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c: New test.
>>
>


* Re: [PATCH][AArch64] Emit tighter strong atomic compare-exchange loop when comparing against zero
  2017-03-08 16:35 [PATCH][AArch64] Emit tighter strong atomic compare-exchange loop when comparing against zero Kyrill Tkachov
  2017-04-24  9:54 ` Kyrill Tkachov
@ 2017-06-02 14:07 ` James Greenhalgh
  1 sibling, 0 replies; 5+ messages in thread
From: James Greenhalgh @ 2017-06-02 14:07 UTC (permalink / raw)
  To: Kyrill Tkachov; +Cc: GCC Patches, Marcus Shawcroft, Richard Earnshaw, nd

On Wed, Mar 08, 2017 at 04:35:38PM +0000, Kyrill Tkachov wrote:
> Hi all,
> 
> For the testcase in this patch where the value of x is zero we currently
> generate:
> foo:
>         mov     w1, 4
> .L2:
>         ldaxr   w2, [x0]
>         cmp     w2, 0
>         bne     .L3
>         stxr    w3, w1, [x0]
>         cbnz    w3, .L2
> .L3:
>         cset    w0, eq
>         ret
> 
> We currently cannot merge the cmp and b.ne inside the loop into a cbnz
> because we need the condition flags set for the return value of the function
> (i.e. the cset at the end).  But if we re-jig the sequence in that case we
> can generate a tighter loop:
> foo:
>         mov     w1, 4
> .L2:
>         ldaxr   w2, [x0]
>         cbnz    w2, .L3
>         stxr    w3, w1, [x0]
>         cbnz    w3, .L2
> .L3:
>         cmp     w2, 0
>         cset    w0, eq
>         ret
> 
> So we add an explicit compare after the loop, and inside the loop we use the
> fact that we're comparing against zero to emit a CBNZ. This means we may end
> up doing the comparison twice (once in the CBNZ, once in the CMP at the end),
> but there is now less code inside the loop.
> 
> I've seen this sequence appear in glibc locking code so maybe it's worth
> adding the extra bit of complexity to the compare-exchange splitter to catch
> this case.
> 
> Bootstrapped and tested on aarch64-none-linux-gnu. In previous iterations of
> the patch, where I had gotten some logic wrong, it caused miscompiles of
> libgomp leading to timeouts in its testsuite, but this version passes
> everything cleanly.
> 
> Ok for GCC 8? (I know it's early, but might as well get it out in case
> someone wants to try it out)

OK.

Thanks,
James

> 
> Thanks,
> Kyrill
> 
> 2017-03-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>     * config/aarch64/aarch64.c (aarch64_split_compare_and_swap):
>     Emit CBNZ inside loop when doing a strong exchange and comparing
>     against zero.  Generate the CC flags after the loop.
> 
> 2017-03-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>     * gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c: New test.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 76a2de20dfcd4ea38fb7c58a9e8612509c5987bd..5fa8e197328ce4cb1718ff7d99b1ea95e02129a4 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -12095,6 +12095,17 @@ aarch64_split_compare_and_swap (rtx operands[])
>    mode = GET_MODE (mem);
>    model = memmodel_from_int (INTVAL (model_rtx));
>  
> +  /* When OLDVAL is zero and we want the strong version we can emit a tighter
> +    loop:
> +    .label1:
> +	LD[A]XR	rval, [mem]
> +	CBNZ	rval, .label2
> +	ST[L]XR	scratch, newval, [mem]
> +	CBNZ	scratch, .label1
> +    .label2:
> +	CMP	rval, 0.  */
> +  bool strong_zero_p = !is_weak && oldval == const0_rtx;
> +
>    label1 = NULL;
>    if (!is_weak)
>      {
> @@ -12111,11 +12122,21 @@ aarch64_split_compare_and_swap (rtx operands[])
>    else
>      aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
>  
> -  cond = aarch64_gen_compare_reg (NE, rval, oldval);
> -  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
> -  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> -			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
> -  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
> +  if (strong_zero_p)
> +    {
> +      x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
> +      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> +				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
> +      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
> +    }
> +  else
> +    {
> +      cond = aarch64_gen_compare_reg (NE, rval, oldval);
> +      x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
> +      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> +				 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
> +      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
> +    }
>  
>    aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
>  
> @@ -12134,7 +12155,15 @@ aarch64_split_compare_and_swap (rtx operands[])
>      }
>  
>    emit_label (label2);
> -
> +  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
> +     to set the condition flags.  If this is not used it will be removed by
> +     later passes.  */
> +  if (strong_zero_p)
> +    {
> +      cond = gen_rtx_REG (CCmode, CC_REGNUM);
> +      x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
> +      emit_insn (gen_rtx_SET (cond, x));
> +    }
>    /* Emit any final barrier needed for a __sync operation.  */
>    if (is_mm_sync (model))
>      aarch64_emit_post_barrier (model);
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b14a7c294376f03cd13077d18d865f83a04bd04e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +int
> +foo (int *a)
> +{
> +  int x = 0;
> +  return __atomic_compare_exchange_n (a, &x, 4, 0,
> +				      __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
> +}
> +
> +/* { dg-final { scan-assembler-times "cbnz\\tw\[0-9\]+" 2 } } */

