public inbox for gcc-patches@gcc.gnu.org
* [og7] Update nvptx_fork/join barrier placement
@ 2018-03-08 23:31 Cesar Philippidis
  2018-03-09 16:22 ` Tom de Vries
  0 siblings, 1 reply; 9+ messages in thread
From: Cesar Philippidis @ 2018-03-08 23:31 UTC (permalink / raw)
  To: gcc-patches, Tom de Vries

[-- Attachment #1: Type: text/plain, Size: 2994 bytes --]

Nvidia Volta GPUs now support warp-level synchronization. As such, the
semantics of legacy bar.sync instructions have slightly changed on newer
GPUs. The PTX JIT will now, occasionally, emit a warpsync instruction
immediately before a bar.sync for Volta GPUs. That implies that warps
must be convergent on entry to those thread barriers.

The problem in og7, and trunk, is that GCC emits barrier instructions at
the wrong spots. E.g., consider the following OpenACC parallel region:

  #pragma acc parallel loop worker
  for (i = 0; i < 10; i++)
    a[i] = i;

At -O2, GCC generates the following PTX code:

        {
                .reg.u32        %y;
                mov.u32 %y, %tid.y;
                setp.ne.u32     %r76, %y, 0;
        }
        {
                .reg.u32        %x;
                mov.u32 %x, %tid.x;
                setp.ne.u32     %r75, %x, 0;
        }
        @%r76   bra.uni $L6;
        @%r75   bra     $L7;
                mov.u64 %r67, %ar0;
        // fork 2;
                cvta.shared.u64 %r74, __oacc_bcast;
                st.u64  [%r74], %r67;
$L7:
$L6:
        @%r75   bra     $L5;
        // forked 2;
                bar.sync        0;
                cvta.shared.u64 %r73, __oacc_bcast;
                ld.u64  %r67, [%r73];
                mov.u32 %r62, %ntid.y;
                mov.u32 %r63, %tid.y;
                setp.gt.s32     %r68, %r63, 9;
        @%r68   bra     $L2;
                mov.u32 %r55, %r63;
                cvt.s64.s32     %r69, %r62;
                shl.b64 %r59, %r69, 2;
                cvt.s64.s32     %r70, %r55;
                shl.b64 %r71, %r70, 2;
                add.u64 %r58, %r67, %r71;
$L3:
                st.u32  [%r58], %r55;
                add.u32 %r55, %r55, %r62;
                add.u64 %r58, %r58, %r59;
                setp.le.s32     %r72, %r55, 9;
        @%r72   bra     $L3;
$L2:
                bar.sync        1;
        // joining 2;
$L5:
        // join 2;
        ret;

Note the bar.sync instructions placed immediately after the forked
comment and before the joining comment. The problem here is that the
branch above the forked comment guarantees that the warps are not
synchronous (when vector_length > 1, which is always the case).
Likewise, the bar.sync instruction before the joining comment should be
placed after label $L5 in order to allow all of the threads in the warp
to reach it.
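In other words, both barriers belong outside the vector-neutered
branch, so the code above should look, roughly, like this (excerpt,
same labels and registers as in the dump above):

$L7:
$L6:
        // forked 2;
                bar.sync        0;
        @%r75   bra     $L5;
                ...
$L2:
        // joining 2;
$L5:
                bar.sync        1;
        // join 2;
        ret;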

The attached patch teaches the nvptx backend to make those
adjustments. It
doesn't cause any regressions on legacy GPUs, but it does resolve quite
a few failures with Volta in the libgomp execution tests. Therefore,
this patch doesn't include any new test cases. Part of this patch came
from my vector_length patch set that I posted last week. However, that
patch set didn't consider the placement of the joining barrier.

I've applied this patch to openacc-gcc-7-branch.

Tom, is a similar patch OK for trunk? The major difference between trunk
and og7 is that og7 changed the name of nvptx_warp_sync to nvptx_cta_sync.

Cesar

[-- Attachment #2: og7-barriers.diff --]
[-- Type: text/x-patch, Size: 1953 bytes --]

2018-03-08  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (nvptx_single): Adjust placement of nvptx_fork
	and nvptx_join neutering labels.
	(nvptx_process_pars): Place the CTA barrier at the beginning of the
	join block.


diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index b16cf59575c..efc6161a6b0 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4056,6 +4056,15 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	return;
     }
 
+  /* NVPTX_BARSYNC barriers are placed immediately before NVPTX_JOIN
+     in order to ensure that all of the threads in a CTA reach the
+     barrier.  Don't neuter BLOCK if head is NVPTX_BARSYNC and tail is
+     NVPTX_JOIN.  */
+  if (from == to
+      && recog_memoized (head) == CODE_FOR_nvptx_barsync
+      && recog_memoized (tail) == CODE_FOR_nvptx_join)
+    return;
+
   /* Insert the vector test inside the worker test.  */
   unsigned mode;
   rtx_insn *before = tail;
@@ -4103,7 +4112,17 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  br = gen_br_true (pred, label);
 	else
 	  br = gen_br_true_uni (pred, label);
-	emit_insn_before (br, head);
+
+	if (recog_memoized (head) == CODE_FOR_nvptx_forked
+	    && recog_memoized (NEXT_INSN (head)) == CODE_FOR_nvptx_barsync)
+	  {
+	    head = NEXT_INSN (head);
+	    emit_insn_after (br, head);
+	  }
+	else if (recog_memoized (head) == CODE_FOR_nvptx_barsync)
+	  emit_insn_after (br, head);
+	else
+	  emit_insn_before (br, head);
 
 	LABEL_NUSES (label)++;
 	if (tail_branch)
@@ -4325,7 +4344,7 @@ nvptx_process_pars (parallel *par)
 	{
 	  /* Insert begin and end synchronizations.  */
 	  emit_insn_after (nvptx_cta_sync (false), par->forked_insn);
-	  emit_insn_before (nvptx_cta_sync (true), par->joining_insn);
+	  emit_insn_before (nvptx_cta_sync (true), par->join_insn);
 	}
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))

* Re: [og7] Update nvptx_fork/join barrier placement
  2018-03-08 23:31 [og7] Update nvptx_fork/join barrier placement Cesar Philippidis
@ 2018-03-09 16:22 ` Tom de Vries
  2018-03-09 16:55   ` Cesar Philippidis
  0 siblings, 1 reply; 9+ messages in thread
From: Tom de Vries @ 2018-03-09 16:22 UTC (permalink / raw)
  To: Cesar Philippidis; +Cc: gcc-patches, Thomas Schwinge

On 03/09/2018 12:31 AM, Cesar Philippidis wrote:
> Nvidia Volta GPUs now support warp-level synchronization.

Well, let's try to make that statement a bit more precise.

All Nvidia architectures have supported synchronization of threads in a 
warp on a very basic level: by means of convergence (and unfortunately, 
we've seen that this is very error-prone).

What is new in ptx 6.0 combined with sm_70 is the ability to sync 
divergent threads without having to converge, f.i. by using new 
instructions bar.warp.sync and barrier.sync.
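
For instance (just a sketch going from the ptx 6.0 ISA text, I have not
run this), a warp-level sync of possibly-divergent threads is
...
bar.warp.sync 0xffffffff;
...
and barrier.sync, without the .aligned modifier, gives a CTA-level
barrier that divergent threads may legally reach:
...
barrier.sync 0;
...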

> As such, the
> semantics of legacy bar.sync instructions have slightly changed on newer
> GPUs.

Before in ptx 3.1, we have for bar.sync:
...
Barriers are executed on a per-warp basis as if all the threads in a 
warp are active. Thus, if any thread in a warp executes a bar 
instruction, it is as if all the threads in the warp have executed
the bar instruction. All threads in the warp are stalled until the 
barrier completes, and the arrival count for the barrier is incremented 
by the warp size (not the number of active threads in the warp). In 
conditionally executed code, a bar instruction should only be used if it 
is known that all threads evaluate the condition identically (the warp 
does not diverge).
...

But in ptx 6.0, we have:
...
bar.sync is equivalent to barrier.sync.aligned
...
and:
...
Instruction barrier has optional .aligned modifier. When specified, it 
indicates that all threads in CTA will execute the same barrier 
instruction. In conditionally executed code, an aligned barrier 
instruction should only be used if it is known that all threads in
CTA evaluate the condition identically, otherwise behavior is undefined.
...

So, in ptx 3.1 bar.sync should be executed in convergent mode (all the 
threads in each warp executing the same). But in ptx 6.0, bar.sync 
should be executed with the whole CTA executing the same code.
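
If I read those two texts correctly, the change can be illustrated like
this (a sketch, untested; assume predicate %p0 is uniform within each
warp but differs between warps):
...
@%p0    bra     $La;
        bar.sync        0;      // warps with !%p0
        bra     $Ldone;
$La:
        bar.sync        0;      // warps with %p0
$Ldone:
...
Each warp is convergent at its bar.sync, which satisfies the ptx 3.1
wording; but the threads of the CTA do not all execute the textually
same barrier instruction, so under the ptx 6.0 .aligned wording the
behavior is undefined.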

So going from the description of ptx, it seems indeed that the semantics 
of bar.sync has changed. That is however surprising, since it would 
break the forward compatibility that AFAIU is the idea behind ptx.

So for now my hope is that this is a documentation error.

> The PTX JIT will now, occasionally, emit a warpsync instruction
> immediately before a bar.sync for Volta GPUs. That implies that warps
> must be convergent on entry to those thread barriers.
> 

That warps must be convergent on entry to bar.sync is already required 
by ptx 3.1.

[ And bar.warp.sync does not force convergence, so if the warpsync 
instruction you mention is equivalent to bar.warp.sync then your 
reasoning is incorrect. ]

> The problem in og7, and trunk, is that GCC emits barrier instructions at
> the wrong spots. E.g., consider the following OpenACC parallel region:
> 
>    #pragma acc parallel loop worker
>    for (i = 0; i < 10; i++)
>      a[i] = i;
> 
> At -O2, GCC generates the following PTX code:
> 
>          {
>                  .reg.u32        %y;
>                  mov.u32 %y, %tid.y;
>                  setp.ne.u32     %r76, %y, 0;
>          }
>          {
>                  .reg.u32        %x;
>                  mov.u32 %x, %tid.x;
>                  setp.ne.u32     %r75, %x, 0;
>          }
>          @%r76   bra.uni $L6;
>          @%r75   bra     $L7;
>                  mov.u64 %r67, %ar0;
>          // fork 2;
>                  cvta.shared.u64 %r74, __oacc_bcast;
>                  st.u64  [%r74], %r67;
> $L7:
> $L6:
>          @%r75   bra     $L5;
>          // forked 2;
>                  bar.sync        0;
>                  cvta.shared.u64 %r73, __oacc_bcast;
>                  ld.u64  %r67, [%r73];
>                  mov.u32 %r62, %ntid.y;
>                  mov.u32 %r63, %tid.y;
>                  setp.gt.s32     %r68, %r63, 9;
>          @%r68   bra     $L2;
>                  mov.u32 %r55, %r63;
>                  cvt.s64.s32     %r69, %r62;
>                  shl.b64 %r59, %r69, 2;
>                  cvt.s64.s32     %r70, %r55;
>                  shl.b64 %r71, %r70, 2;
>                  add.u64 %r58, %r67, %r71;
> $L3:
>                  st.u32  [%r58], %r55;
>                  add.u32 %r55, %r55, %r62;
>                  add.u64 %r58, %r58, %r59;
>                  setp.le.s32     %r72, %r55, 9;
>          @%r72   bra     $L3;
> $L2:
>                  bar.sync        1;
>          // joining 2;
> $L5:
>          // join 2;
>          ret;
> 
> Note the bar.sync instructions placed immediately after the forked
> comment and before the joining comment. The problem here is that the
> branch above the forked comment guarantees that the warps are not
> synchronous (when vector_length > 1, which is always the case).

This is already advised against in ptx 3.1, so yes, we should fix this.

> Likewise, the bar.sync
> instruction before the joining comment should be placed after label $L5
> in order to allow all of the threads in the warp to reach it.
> 

Agreed.

> The attached patch teaches the nvptx backend to make those adjustments.

Can you show me a diff of the ptx for the test-case above for trunk?

> It
> doesn't cause any regressions on legacy GPUs, but it does resolve quite
> a few failures with Volta in the libgomp execution tests. 

So, did you test this on trunk?

> Therefore,
> this patch doesn't include any new test cases. 

Makes sense.

> Part of this patch came
> from my vector_length patch set that I posted last week. However, that
> patch set didn't consider the placement of the joining barrier.
> 
> I've applied this patch to openacc-gcc-7-branch.
> 
> Tom, is a similar patch OK for trunk? The major difference between trunk
> and og7 is that og7 changed the name of nvptx_warp_sync to nvptx_cta_sync.
> 

Please, if you want to have a patch accepted for trunk, then just submit 
a trunk patch.

> Cesar
> 
> 
> og7-barriers.diff
> 
> 
> 2018-03-08  Cesar Philippidis  <cesar@codesourcery.com>
> 
> 	gcc/
> 	* config/nvptx/nvptx.c (nvptx_single): Adjust placement of nvptx_fork
> 	and nvptx_join neutering labels.
> 	(nvptx_process_pars): Place the CTA barrier at the beginning of the
> 	join block.
> 
> 
> diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
> index b16cf59575c..efc6161a6b0 100644
> --- a/gcc/config/nvptx/nvptx.c
> +++ b/gcc/config/nvptx/nvptx.c
> @@ -4056,6 +4056,15 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
>   	return;
>       }
>   
> +  /* NVPTX_BARSYNC barriers are placed immediately before NVPTX_JOIN
> +     in order to ensure that all of the threads in a CTA reach the
> +     barrier.  Don't neuter BLOCK if head is NVPTX_BARSYNC and tail is
> +     NVPTX_JOIN.  */
> +  if (from == to
> +      && recog_memoized (head) == CODE_FOR_nvptx_barsync
> +      && recog_memoized (tail) == CODE_FOR_nvptx_join)
> +    return;
> +
>     /* Insert the vector test inside the worker test.  */
>     unsigned mode;
>     rtx_insn *before = tail;
> @@ -4103,7 +4112,17 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
>   	  br = gen_br_true (pred, label);
>   	else
>   	  br = gen_br_true_uni (pred, label);
> -	emit_insn_before (br, head);
> +
> +	if (recog_memoized (head) == CODE_FOR_nvptx_forked
> +	    && recog_memoized (NEXT_INSN (head)) == CODE_FOR_nvptx_barsync)
> +	  {
> +	    head = NEXT_INSN (head);
> +	    emit_insn_after (br, head);
> +	  }
> +	else if (recog_memoized (head) == CODE_FOR_nvptx_barsync)
> +	  emit_insn_after (br, head);
> +	else
> +	  emit_insn_before (br, head);
>   
>   	LABEL_NUSES (label)++;
>   	if (tail_branch)
> @@ -4325,7 +4344,7 @@ nvptx_process_pars (parallel *par)
>   	{
>   	  /* Insert begin and end synchronizations.  */
>   	  emit_insn_after (nvptx_cta_sync (false), par->forked_insn);
> -	  emit_insn_before (nvptx_cta_sync (true), par->joining_insn);
> +	  emit_insn_before (nvptx_cta_sync (true), par->join_insn);
>   	}
>       }
>     else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
> 

Hmm, the patch looks a bit fragile to me.

I wonder if it's possible to do something similar to
https://gcc.gnu.org/bugzilla/attachment.cgi?id=43480&action=diff

Thanks,
- Tom

* Re: [og7] Update nvptx_fork/join barrier placement
  2018-03-09 16:22 ` Tom de Vries
@ 2018-03-09 16:55   ` Cesar Philippidis
  2018-03-19 14:30     ` Tom de Vries
  0 siblings, 1 reply; 9+ messages in thread
From: Cesar Philippidis @ 2018-03-09 16:55 UTC (permalink / raw)
  To: Tom de Vries; +Cc: gcc-patches, Thomas Schwinge

[-- Attachment #1: Type: text/plain, Size: 10165 bytes --]

On 03/09/2018 08:21 AM, Tom de Vries wrote:
> On 03/09/2018 12:31 AM, Cesar Philippidis wrote:
>> Nvidia Volta GPUs now support warp-level synchronization.
> 
> Well, let's try to make that statement a bit more precise.
> 
> All Nvidia architectures have supported synchronization of threads in a
> warp on a very basic level: by means of convergence (and unfortunately,
> we've seen that this is very error-prone).
>
> What is new in ptx 6.0 combined with sm_70 is the ability to sync
> divergent threads without having to converge, f.i. by using new
> instructions bar.warp.sync and barrier.sync.

Yes. The major difference between sm_70 GPU architectures and earlier
GPUs is that sm_70 allows the user to explicitly synchronize divergent
warps. At least on Maxwell and Pascal, the PTX SASS compiler uses two
instructions to branch, SYNC and BRA. I think SYNC guarantees that a
warp is convergent at the SYNC point, whereas BRA makes no such
guarantees.

What's worse, once a warp has become divergent on sm_60 and earlier
GPUs, there's no way to reliably reconverge its threads. So, to avoid
that problem, it's critical that the PTX SASS compiler use SYNC
instructions when possible. Fortunately, bar.warp.sync resolves the
divergent warp problem on sm_70+.
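
For illustration, the pattern I remember from cuobjdump output on
Maxwell/Pascal looks roughly like this (from memory, so treat the
exact mnemonics and syntax as approximate):

        SSY  `(.L_x);           // set the reconvergence point
   @P0  BRA  `(.L_else);        // possibly-divergent branch
        ...
        SYNC;                   // reconverge the warp at .L_x

That is, SSY announces where the two sides of the branch should
reconverge, and SYNC is what actually brings the warp back together.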

>> As such, the
>> semantics of legacy bar.sync instructions have slightly changed on newer
>> GPUs.
> 
> Before in ptx 3.1, we have for bar.sync:
> ...
> Barriers are executed on a per-warp basis as if all the threads in a
> warp are active. Thus, if any thread in a warp executes a bar
> instruction, it is as if all the threads in the warp have executed
> the bar instruction. All threads in the warp are stalled until the
> barrier completes, and the arrival count for the barrier is incremented
> by the warp size (not the number of active threads in the warp). In
> conditionally executed code, a bar instruction should only be used if it
> is known that all threads evaluate the condition identically (the warp
> does not diverge).
> ...
> 
> But in ptx 6.0, we have:
> ...
> bar.sync is equivalent to barrier.sync.aligned
> ...
> and:
> ...
> Instruction barrier has optional .aligned modifier. When specified, it
> indicates that all threads in CTA will execute the same barrier
> instruction. In conditionally executed code, an aligned barrier
> instruction should only be used if it is known that all threads in
> CTA evaluate the condition identically, otherwise behavior is undefined.
> ...
> 
> So, in ptx 3.1 bar.sync should be executed in convergent mode (all the
> threads in each warp executing the same). But in ptx 6.0, bar.sync
> should be executed with the whole CTA executing the same code.
> 
> So going from the description of ptx, it seems indeed that the semantics
> of bar.sync has changed. That is however surprising, since it would
> break the forward compatibility that AFAIU is the idea behind ptx.
> 
> So for now my hope is that this is a documentation error.

I spent a lot of time debugging deadlocks with the vector length changes
and I have seen no changes in the SASS code generated in the newer Nvidia
drivers when compared to the older ones, at least with respect to the
barrier instructions. This isn't the first time I've seen
inconsistencies with thread synchronization in Nvidia's documentation.
For the longest time, the "CUDA Programming Guide" provided slightly
conflicting semantics for the __syncthreads() function, which ultimately
gets implemented as bar.sync in PTX.
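
For reference, here's a minimal CUDA kernel (just a sketch, the kernel
is made up) where __syncthreads() comes out as "bar.sync 0" in the PTX
that nvcc -ptx emits:

  __global__ void
  reverse32 (int *a)
  {
    __shared__ int buf[32];
    buf[threadIdx.x] = a[threadIdx.x];
    __syncthreads ();                   /* emitted as bar.sync 0 */
    a[threadIdx.x] = buf[31 - threadIdx.x];
  }

assuming it's launched with a single warp-sized block, e.g.
reverse32<<<1, 32>>> (dev_a).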

>> The PTX JIT will now, occasionally, emit a warpsync instruction
>> immediately before a bar.sync for Volta GPUs. That implies that warps
>> must be convergent on entry to those thread barriers.
>>
> 
> That warps must be convergent on entry to bar.sync is already required
> by ptx 3.1.
> 
> [ And bar.warp.sync does not force convergence, so if the warpsync
> instruction you mention is equivalent to bar.warp.sync then your
> reasoning is incorrect. ]

I'm under the impression that bar.warp.sync converges all of the
non-exited threads in a warp. You'd still need to use bar.sync or some
variant of the new barrier instruction to converge the entire CTA. But
at the moment, we're still generating code that's backwards compatible
with sm_30.

>> The problem in og7, and trunk, is that GCC emits barrier instructions at
>> the wrong spots. E.g., consider the following OpenACC parallel region:
>>
>>    #pragma acc parallel loop worker
>>    for (i = 0; i < 10; i++)
>>      a[i] = i;
>>
>> At -O2, GCC generates the following PTX code:
>>
>>          {
>>                  .reg.u32        %y;
>>                  mov.u32 %y, %tid.y;
>>                  setp.ne.u32     %r76, %y, 0;
>>          }
>>          {
>>                  .reg.u32        %x;
>>                  mov.u32 %x, %tid.x;
>>                  setp.ne.u32     %r75, %x, 0;
>>          }
>>          @%r76   bra.uni $L6;
>>          @%r75   bra     $L7;
>>                  mov.u64 %r67, %ar0;
>>          // fork 2;
>>                  cvta.shared.u64 %r74, __oacc_bcast;
>>                  st.u64  [%r74], %r67;
>> $L7:
>> $L6:
>>          @%r75   bra     $L5;
>>          // forked 2;
>>                  bar.sync        0;
>>                  cvta.shared.u64 %r73, __oacc_bcast;
>>                  ld.u64  %r67, [%r73];
>>                  mov.u32 %r62, %ntid.y;
>>                  mov.u32 %r63, %tid.y;
>>                  setp.gt.s32     %r68, %r63, 9;
>>          @%r68   bra     $L2;
>>                  mov.u32 %r55, %r63;
>>                  cvt.s64.s32     %r69, %r62;
>>                  shl.b64 %r59, %r69, 2;
>>                  cvt.s64.s32     %r70, %r55;
>>                  shl.b64 %r71, %r70, 2;
>>                  add.u64 %r58, %r67, %r71;
>> $L3:
>>                  st.u32  [%r58], %r55;
>>                  add.u32 %r55, %r55, %r62;
>>                  add.u64 %r58, %r58, %r59;
>>                  setp.le.s32     %r72, %r55, 9;
>>          @%r72   bra     $L3;
>> $L2:
>>                  bar.sync        1;
>>          // joining 2;
>> $L5:
>>          // join 2;
>>          ret;
>>
>> Note the bar.sync instructions placed immediately after the forked
>> comment and before the joining comment. The problem here is that the
>> branch above the forked comment guarantees that the warps are not
>> synchronous (when vector_length > 1, which is always the case).
> 
> This is already advised against in ptx 3.1, so yes, we should fix this.
> 
>> Likewise, the bar.sync
>> instruction before the joining comment should be placed after label $L5
>> in order to allow all of the threads in the warp to reach it.
>>
> 
> Agreed.
> 
>> The attached patch teaches the nvptx backend to make those adjustments.
> 
> Can you show me a diff of the ptx for the test-case above for trunk?

--- w-old.s     2018-03-08 15:19:47.139516578 -0800
+++ w.s 2018-03-09 08:42:52.217057332 -0800
@@ -46,9 +46,9 @@
                st.u64  [%r74], %r67;
 $L7:
 $L6:
-       @%r75   bra     $L5;
        // forked 2;
                bar.sync        0;
+       @%r75   bra     $L5;
                cvta.shared.u64 %r73, __oacc_bcast;
                ld.u64  %r67, [%r73];
                mov.u32 %r62, %ntid.y;
@@ -68,9 +68,9 @@
                setp.le.s32     %r72, %r55, 9;
        @%r72   bra     $L3;
 $L2:
-               bar.sync        1;
        // joining 2;
 $L5:
+               bar.sync        1;
        // join 2;
        ret;
 }

>> It
>> doesn't cause any regressions on legacy GPUs, but it does resolve quite
>> a few failures with Volta in the libgomp execution tests. 
> 
> So, did you test this on trunk?

Yes, but only on my GeForce 1070, because I'm debugging the
parallel-dims.c failure on the Titan V. There are no new regressions in
trunk.

>> Therefore,
>> this patch doesn't include any new test cases. 
> 
> Makes sense.
> 
>> Part of this patch came
>> from my vector_length patch set that I posted last week. However, that
>> patch set didn't consider the placement of the joining barrier.
>>
>> I've applied this patch to openacc-gcc-7-branch.
>>
>> Tom, is a similar patch OK for trunk? The major difference between trunk
>> and og7 is that og7 changed the name of nvptx_warp_sync to
>> nvptx_cta_sync.
>>
> 
> Please, if you want to have a patch accepted for trunk, then just submit
> a trunk patch.

Here's the trunk patch. Is it OK for trunk?

Cesar

[-- Attachment #2: trunk-barriers.diff --]
[-- Type: text/x-patch, Size: 2076 bytes --]

2018-03-09  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (nvptx_single): Adjust placement of nvptx_fork
	and nvptx_join neutering labels.
	(nvptx_process_pars): Place the CTA barrier at the beginning of the
	join block.


diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index a6f444340fd..81fcf2c28bc 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4037,6 +4037,15 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	return;
     }
 
+  /* NVPTX_BARSYNC barriers are placed immediately before NVPTX_JOIN
+     in order to ensure that all of the threads in a CTA reach the
+     barrier.  Don't neuter BLOCK if head is NVPTX_BARSYNC and tail is
+     NVPTX_JOIN.  */
+  if (from == to
+      && recog_memoized (head) == CODE_FOR_nvptx_barsync
+      && recog_memoized (tail) == CODE_FOR_nvptx_join)
+    return;
+
   /* Insert the vector test inside the worker test.  */
   unsigned mode;
   rtx_insn *before = tail;
@@ -4057,7 +4066,17 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  br = gen_br_true (pred, label);
 	else
 	  br = gen_br_true_uni (pred, label);
-	emit_insn_before (br, head);
+
+	if (recog_memoized (head) == CODE_FOR_nvptx_forked
+	    && recog_memoized (NEXT_INSN (head)) == CODE_FOR_nvptx_barsync)
+	  {
+	    head = NEXT_INSN (head);
+	    emit_insn_after (br, head);
+	  }
+	else if (recog_memoized (head) == CODE_FOR_nvptx_barsync)
+	  emit_insn_after (br, head);
+	else
+	  emit_insn_before (br, head);
 
 	LABEL_NUSES (label)++;
 	if (tail_branch)
@@ -4276,7 +4295,7 @@ nvptx_process_pars (parallel *par)
       nvptx_wpropagate (true, par->forked_block, par->fork_insn);
       /* Insert begin and end synchronizations.  */
       emit_insn_after (nvptx_wsync (false), par->forked_insn);
-      emit_insn_before (nvptx_wsync (true), par->joining_insn);
+      emit_insn_before (nvptx_wsync (true), par->join_insn);
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
     nvptx_vpropagate (par->forked_block, par->forked_insn);

* Re: [og7] Update nvptx_fork/join barrier placement
  2018-03-09 16:55   ` Cesar Philippidis
@ 2018-03-19 14:30     ` Tom de Vries
  2018-03-19 15:24       ` Cesar Philippidis
  0 siblings, 1 reply; 9+ messages in thread
From: Tom de Vries @ 2018-03-19 14:30 UTC (permalink / raw)
  To: Cesar Philippidis; +Cc: gcc-patches, Thomas Schwinge

[-- Attachment #1: Type: text/plain, Size: 11144 bytes --]

On 03/09/2018 05:55 PM, Cesar Philippidis wrote:
> On 03/09/2018 08:21 AM, Tom de Vries wrote:
>> On 03/09/2018 12:31 AM, Cesar Philippidis wrote:
>>> Nvidia Volta GPUs now support warp-level synchronization.
>>
>> Well, let's try to make that statement a bit more precise.
>>
>> All Nvidia architectures have supported synchronization of threads in a
>> warp on a very basic level: by means of convergence (and unfortunately,
>> we've seen that this is very error-prone).
>>
>> What is new in ptx 6.0 combined with sm_70 is the ability to sync
>> divergent threads without having to converge, f.i. by using new
>> instructions bar.warp.sync and barrier.sync.
> 
> Yes. The major difference between sm_70 GPU architectures and earlier
> GPUs is that sm_70 allows the user to explicitly synchronize divergent
> warps. At least on Maxwell and Pascal, the PTX SASS compiler uses two
> instructions to branch, SYNC and BRA. I think SYNC guarantees that a
> warp is convergent at the SYNC point, whereas BRA makes no such
> guarantees.
> 

If you want to understand the interplay of sync (or .s suffix), branch 
and ssy, please read 
https://people.engr.ncsu.edu/hzhou/ispass_15-poster.pdf .

> What's worse, once a warp has become divergent on sm_60 and earlier
> GPUs, there's no way to reliably reconverge its threads. So, to avoid
> that problem, it's critical that the PTX SASS compiler use SYNC
> instructions when possible. Fortunately, bar.warp.sync resolves the
> divergent warp problem on sm_70+.
> 
>>> As such, the
>>> semantics of legacy bar.sync instructions have slightly changed on newer
>>> GPUs.
>>
>> Before in ptx 3.1, we have for bar.sync:
>> ...
>> Barriers are executed on a per-warp basis as if all the threads in a
>> warp are active. Thus, if any thread in a warp executes a bar
>> instruction, it is as if all the threads in the warp have executed
>> the bar instruction. All threads in the warp are stalled until the
>> barrier completes, and the arrival count for the barrier is incremented
>> by the warp size (not the number of active threads in the warp). In
>> conditionally executed code, a bar instruction should only be used if it
>> is known that all threads evaluate the condition identically (the warp
>> does not diverge).
>> ...
>>
>> But in ptx 6.0, we have:
>> ...
>> bar.sync is equivalent to barrier.sync.aligned
>> ...
>> and:
>> ...
>> Instruction barrier has optional .aligned modifier. When specified, it
>> indicates that all threads in CTA will execute the same barrier
>> instruction. In conditionally executed code, an aligned barrier
>> instruction should only be used if it is known that all threads in
>> CTA evaluate the condition identically, otherwise behavior is undefined.
>> ...
>>
>> So, in ptx 3.1 bar.sync should be executed in convergent mode (all the
>> threads in each warp executing the same). But in ptx 6.0, bar.sync
>> should be executed with the whole CTA executing the same code.
>>
>> So going from the description of ptx, it seems indeed that the semantics
>> of bar.sync has changed. That is however surprising, since it would
>> break the forward compatibility that AFAIU is the idea behind ptx.
>>
>> So for now my hope is that this is a documentation error.
> 
> I spent a lot of time debugging deadlocks with the vector length changes
> and I have seen no changes in the SASS code generated in the newer Nvidia
> drivers when compared to the older ones, at least with respect to the
> barrier instructions. This isn't the first time I've seen
> inconsistencies with thread synchronization in Nvidia's documentation.
> For the longest time, the "CUDA Programming Guide" provided slightly
> conflicting semantics for the __syncthreads() function, which ultimately
> gets implemented as bar.sync in PTX.
> 
>>> The PTX JIT will now, occasionally, emit a warpsync instruction
>>> immediately before a bar.sync for Volta GPUs. That implies that warps
>>> must be convergent on entry to those thread barriers.
>>>
>>
>> That warps must be convergent on entry to bar.sync is already required
>> by ptx 3.1.
>>
>> [ And bar.warp.sync does not force convergence, so if the warpsync
>> instruction you mention is equivalent to bar.warp.sync then your
>> reasoning is incorrect. ]
> 
> I'm under the impression that bar.warp.sync converges all of the
> non-exited threads in a warp.

I have not played around with the instruction yet, so I'm not sure, but 
what I read from the docs is that bar.warp.sync converges all of the 
non-exited threads in a warp if and only if it's positioned at a point
post-dominating a divergent branch.

Consider this case:
...
if (tid.x == 0)
   {
     A;
     bar.warp.sync 32;
     B;
   }
else
   {
     C;
     bar.warp.sync 32;
     D;
   }
...
AFAIU, this allows bar.warp.sync to synchronize the threads in the warp, 
_without_ converging.


> You'd still need to use bar.sync or some
> variant of the new barrier instruction to converge the entire CTA. But
> at the moment, we're still generating code that's backwards compatible
> with sm_30.
> 
>>> The problem in og7, and trunk, is that GCC emits barrier instructions at
>>> the wrong spots. E.g., consider the following OpenACC parallel region:
>>>
>>>     #pragma acc parallel loop worker
>>>     for (i = 0; i < 10; i++)
>>>       a[i] = i;
>>>
>>> At -O2, GCC generates the following PTX code:
>>>
>>>           {
>>>                   .reg.u32        %y;
>>>                   mov.u32 %y, %tid.y;
>>>                   setp.ne.u32     %r76, %y, 0;
>>>           }
>>>           {
>>>                   .reg.u32        %x;
>>>                   mov.u32 %x, %tid.x;
>>>                   setp.ne.u32     %r75, %x, 0;
>>>           }
>>>           @%r76   bra.uni $L6;
>>>           @%r75   bra     $L7;
>>>                   mov.u64 %r67, %ar0;
>>>           // fork 2;
>>>                   cvta.shared.u64 %r74, __oacc_bcast;
>>>                   st.u64  [%r74], %r67;
>>> $L7:
>>> $L6:
>>>           @%r75   bra     $L5;
>>>           // forked 2;
>>>                   bar.sync        0;
>>>                   cvta.shared.u64 %r73, __oacc_bcast;
>>>                   ld.u64  %r67, [%r73];
>>>                   mov.u32 %r62, %ntid.y;
>>>                   mov.u32 %r63, %tid.y;
>>>                   setp.gt.s32     %r68, %r63, 9;
>>>           @%r68   bra     $L2;
>>>                   mov.u32 %r55, %r63;
>>>                   cvt.s64.s32     %r69, %r62;
>>>                   shl.b64 %r59, %r69, 2;
>>>                   cvt.s64.s32     %r70, %r55;
>>>                   shl.b64 %r71, %r70, 2;
>>>                   add.u64 %r58, %r67, %r71;
>>> $L3:
>>>                   st.u32  [%r58], %r55;
>>>                   add.u32 %r55, %r55, %r62;
>>>                   add.u64 %r58, %r58, %r59;
>>>                   setp.le.s32     %r72, %r55, 9;
>>>           @%r72   bra     $L3;
>>> $L2:
>>>                   bar.sync        1;
>>>           // joining 2;
>>> $L5:
>>>           // join 2;
>>>           ret;
>>>
>>> Note the bar.sync instructions placed immediately after the forked
>>> comment and before the joining comment. The problem here is that the
>>> branch above the forked comment guarantees that the warps are not
>>> synchronous (when vector_length > 1, which is always the case).
>>
>> This is already advised against in ptx 3.1, so yes, we should fix this.
>>
>>> Likewise, the bar.sync
>>> instruction before the joining comment should be placed after label $L5
>>> in order to allow all of the threads in the warp to reach it.
>>>
>>
>> Agreed.
>>
>>> The attached patch teaches the nvptx backend to make those adjustments.
>>
>> Can you show me a diff of the ptx for the test-case above for trunk?
> 
> --- w-old.s     2018-03-08 15:19:47.139516578 -0800
> +++ w.s 2018-03-09 08:42:52.217057332 -0800
> @@ -46,9 +46,9 @@
>                  st.u64  [%r74], %r67;
>   $L7:
>   $L6:
> -       @%r75   bra     $L5;
>          // forked 2;
>                  bar.sync        0;
> +       @%r75   bra     $L5;
>                  cvta.shared.u64 %r73, __oacc_bcast;
>                  ld.u64  %r67, [%r73];
>                  mov.u32 %r62, %ntid.y;
> @@ -68,9 +68,9 @@
>                  setp.le.s32     %r72, %r55, 9;
>          @%r72   bra     $L3;
>   $L2:
> -               bar.sync        1;
>          // joining 2;
>   $L5:
> +               bar.sync        1;
>          // join 2;
>          ret;
>   }
> 
> 

At -O0, yes.

At -O2, we have:
...
  diff -u -a 1 2
--- 1   2018-03-19 14:13:44.074834552 +0100
+++ 2   2018-03-19 14:15:06.075301168 +0100
@@ -42,20 +42,20 @@
  st.u64 [%r32],%r25;
  $L7:
  $L6:
-@ %r33 bra $L5;
  // forked 2;
  bar.sync 0;
+@ %r33 bra $L5;
  cvta.shared.u64 %r31,__worker_bcast;
  ld.u64 %r25,[%r31];
  mov.u32 %r24,%tid.y;
  setp.le.s32 %r26,%r24,9;
  @ %r26 bra $L2;
  $L3:
-bar.sync 1;
  // joining 2;
  $L5:
-@ %r34 bra.uni $L8;
+bar.sync 1;
  @ %r33 bra $L9;
+@ %r34 bra.uni $L8;
  // join 2;
  $L9:
  $L8:
...

Note that this changes the ordering of the vector-neutering jump and
the worker-neutering jump at the end. In principle, this should not be
harmful, but it violates the invariant that vector-neutering 
branch-around code should be as short-lived as possible. So, this needs 
to be fixed.

I've found this issue by adding verification of the neutering, as 
attached below.

Thanks,
- Tom

[-- Attachment #2: 0001-Verify-bar.sync-position.patch --]
[-- Type: text/x-patch, Size: 5422 bytes --]

Verify bar.sync position

---
 gcc/config/nvptx/nvptx.c | 140 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 135 insertions(+), 5 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 81fcf2c28bc..f1f9f72bf82 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -3944,6 +3944,114 @@ bb_first_real_insn (basic_block bb)
 }
 #endif
 
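+/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.  */
+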
+static bool
+verify_neutering_jumps (basic_block from,
+			rtx_insn *vector_jump, rtx_insn *worker_jump,
+			rtx_insn *vector_label, rtx_insn *worker_label)
+{
+  basic_block bb = from;
+  rtx_insn *insn = BB_HEAD (bb);
+  bool seen_worker_jump = false;
+  bool seen_vector_jump = false;
+  bool seen_worker_label = false;
+  bool seen_vector_label = false;
+  bool worker_neutered = false;
+  bool vector_neutered = false;
+  while (true)
+    {
+      if (insn == worker_jump)
+	{
+	  seen_worker_jump = true;
+	  worker_neutered = true;
+	  gcc_assert (!vector_neutered);
+	}
+      else if (insn == vector_jump)
+	{
+	  seen_vector_jump = true;
+	  vector_neutered = true;
+	}
+      else if (insn == worker_label)
+	{
+	  seen_worker_label = true;
+	  gcc_assert (worker_neutered);
+	  worker_neutered = false;
+	}
+      else if (insn == vector_label)
+	{
+	  seen_vector_label = true;
+	  gcc_assert (vector_neutered);
+	  vector_neutered = false;
+	}
+      else if (INSN_P (insn))
+	switch (recog_memoized (insn))
+	  {
+	  case CODE_FOR_nvptx_barsync:
+	    gcc_assert (!vector_neutered && !worker_neutered);
+	    break;
+	  default:
+	    break;
+	  }
+
+      if (insn != BB_END (bb))
+	insn = NEXT_INSN (insn);
+      else if (JUMP_P (insn) && single_succ_p (bb)
+	       && !seen_vector_jump && !seen_worker_jump)
+	{
+	  bb = single_succ (bb);
+	  insn = BB_HEAD (bb);
+	}
+      else
+	break;
+    }
+
+  gcc_assert (!(vector_jump && !seen_vector_jump));
+  gcc_assert (!(worker_jump && !seen_worker_jump));
+
+  if (seen_vector_label || seen_worker_label)
+    {
+      gcc_assert (!(vector_label && !seen_vector_label));
+      gcc_assert (!(worker_label && !seen_worker_label));
+
+      return true;
+    }
+
+  return false;
+}
+
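+/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  */
+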
+static void
+verify_neutering_labels (basic_block to, rtx_insn *vector_label,
+			 rtx_insn *worker_label)
+{
+  basic_block bb = to;
+  rtx_insn *insn = BB_END (bb);
+  bool seen_worker_label = false;
+  bool seen_vector_label = false;
+  while (true)
+    {
+      if (insn == worker_label)
+	{
+	  seen_worker_label = true;
+	  gcc_assert (!seen_vector_label);
+	}
+      else if (insn == vector_label)
+	seen_vector_label = true;
+      else if (INSN_P (insn))
+	switch (recog_memoized (insn))
+	  {
+	  case CODE_FOR_nvptx_barsync:
+	    gcc_assert (!seen_vector_label && !seen_worker_label);
+	    break;
+	  }
+
+      if (insn != BB_HEAD (bb))
+	insn = PREV_INSN (insn);
+      else
+	break;
+    }
+
+  gcc_assert (!(vector_label && !seen_vector_label));
+  gcc_assert (!(worker_label && !seen_worker_label));
+}
+
 /* Single neutering according to MASK.  FROM is the incoming block and
    TO is the outgoing block.  These may be the same block. Insert at
    start of FROM:
@@ -4049,6 +4157,8 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
   /* Insert the vector test inside the worker test.  */
   unsigned mode;
   rtx_insn *before = tail;
+  rtx_insn *worker_label = NULL, *vector_label = NULL;
+  rtx_insn *worker_jump = NULL, *vector_jump = NULL;
   for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
     if (GOMP_DIM_MASK (mode) & skip_mask)
       {
@@ -4067,27 +4177,42 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	else
 	  br = gen_br_true_uni (pred, label);
 
+	rtx_insn *br_insn;
 	if (recog_memoized (head) == CODE_FOR_nvptx_forked
 	    && recog_memoized (NEXT_INSN (head)) == CODE_FOR_nvptx_barsync)
 	  {
 	    head = NEXT_INSN (head);
-	    emit_insn_after (br, head);
+	    br_insn = emit_insn_after (br, head);
 	  }
 	else if (recog_memoized (head) == CODE_FOR_nvptx_barsync)
-	  emit_insn_after (br, head);
+	  br_insn = emit_insn_after (br, head);
+	else
+	  br_insn = emit_insn_before (br, head);
+
+	if (mode == GOMP_DIM_VECTOR)
+	  vector_jump = br_insn;
 	else
-	  emit_insn_before (br, head);
+	  worker_jump = br_insn;
 
 	LABEL_NUSES (label)++;
+	rtx_insn *label_insn;
 	if (tail_branch)
-	  before = emit_label_before (label, before);
+	  {
+	    label_insn = emit_label_before (label, before);
+	    before = label_insn;
+	  }
 	else
 	  {
-	    rtx_insn *label_insn = emit_label_after (label, tail);
+	    label_insn = emit_label_after (label, tail);
 	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
 		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
 	      emit_insn_after (gen_exit (), label_insn);
 	  }
+
+	if (mode == GOMP_DIM_VECTOR)
+	  vector_label = label_insn;
+	else
+	  worker_label = label_insn;
       }
 
   /* Now deal with propagating the branch condition.  */
@@ -4187,6 +4312,11 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 				 UNSPEC_BR_UNIFIED);
       validate_change (tail, recog_data.operand_loc[0], unsp, false);
     }
+
+  bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
+					    vector_label, worker_label);
+  if (!seen_label)
+    verify_neutering_labels (to, vector_label, worker_label);
 }
 
 /* PAR is a parallel that is being skipped in its entirety according to

* Re: [og7] Update nvptx_fork/join barrier placement
  2018-03-19 14:30     ` Tom de Vries
@ 2018-03-19 15:24       ` Cesar Philippidis
  2018-03-19 15:44         ` Tom de Vries
  2018-03-19 17:04         ` Tom de Vries
  0 siblings, 2 replies; 9+ messages in thread
From: Cesar Philippidis @ 2018-03-19 15:24 UTC (permalink / raw)
  To: Tom de Vries; +Cc: gcc-patches, Thomas Schwinge

On 03/19/2018 07:04 AM, Tom de Vries wrote:
> On 03/09/2018 05:55 PM, Cesar Philippidis wrote:
>> On 03/09/2018 08:21 AM, Tom de Vries wrote:
>>> On 03/09/2018 12:31 AM, Cesar Philippidis wrote:
>>>> Nvidia Volta GPUs now support warp-level synchronization.
>>>
>>> Well, let's try to make that statement a bit more precise.
>>>
>>> All Nvidia architectures have supported synchronization of threads in a
>>> warp on a very basic level: by means of convergence (and unfortunately,
>>> we've seen that this is very error-prone).
>>>
>>> What is new in ptx 6.0 combined with sm_70 is the ability to sync
>>> divergent threads without having to converge, f.i. by using new
>>> instructions bar.warp.sync and barrier.sync.
>>
>> Yes. The major difference between sm_70 GPU architectures and earlier
>> GPUs is that sm_70 allows the user to explicitly synchronize divergent
>> warps. At least on Maxwell and Pascal, the PTX SASS compiler uses two
>> instructions to branch, SYNC and BRA. I think SYNC guarantees that a
>> warp is convergent at the SYNC point, whereas BRA makes no such
>> guarantees.
>>
> 
> If you want to understand the interplay of sync (or .s suffix), branch
> and ssy, please read
> https://people.engr.ncsu.edu/hzhou/ispass_15-poster.pdf .

Interesting, thanks!

>> What's worse, once a warp has become divergent on sm_60 and earlier
>> GPUs, there's no way to reliably reconverge its threads. So, to avoid
>> that problem, it's critical that the PTX SASS compiler use SYNC
>> instructions when possible. Fortunately, bar.warp.sync resolves the
>> divergent warp problem on sm_70+.
>>
>>>> As such, the
>>>> semantics of legacy bar.sync instructions have slightly changed on
>>>> newer
>>>> GPUs.
>>>
>>> Before in ptx 3.1, we have for bar.sync:
>>> ...
>>> Barriers are executed on a per-warp basis as if all the threads in a
>>> warp are active. Thus, if any thread in a warp executes a bar
>>> instruction, it is as if all the threads in the warp have executed
>>> the bar instruction. All threads in the warp are stalled until the
>>> barrier completes, and the arrival count for the barrier is incremented
>>> by the warp size (not the number of active threads in the warp). In
>>> conditionally executed code, a bar instruction should only be used if it
>>> is known that all threads evaluate the condition identically (the warp
>>> does not diverge).
>>> ...
>>>
>>> But in ptx 6.0, we have:
>>> ...
>>> bar.sync is equivalent to barrier.sync.aligned
>>> ...
>>> and:
>>> ...
>>> Instruction barrier has optional .aligned modifier. When specified, it
>>> indicates that all threads in CTA will execute the same barrier
>>> instruction. In conditionally executed code, an aligned barrier
>>> instruction should only be used if it is known that all threads in
>>> CTA evaluate the condition identically, otherwise behavior is undefined.
>>> ...
>>>
>>> So, in ptx 3.1 bar.sync should be executed in convergent mode (all the
>>> threads in each warp executing the same). But in ptx 6.0, bar.sync
>>> should be executed with the whole CTA executing the same code.
>>>
>>> So going from the description of ptx, it seems indeed that the semantics
>>> of bar.sync has changed. That is however surprising, since it would
>>> break the forward compatibility that AFAIU is the idea behind ptx.
>>>
>>> So for now my hope is that this is a documentation error.
>>
>> I spent a lot of time debugging deadlocks with the vector length changes
>> and I have seen no changes in the SASS code generated in the newer Nvidia
>> drivers when compared to the older ones, at least with respect to the
>> barrier instructions. This isn't the first time I've seen
>> inconsistencies with thread synchronization in Nvidia's documentation.
>> For the longest time, the "CUDA Programming Guide" provided slightly
>> conflicting semantics for the __syncthreads() function, which ultimately
>> gets implemented as bar.sync in PTX.
>>
>>>> The PTX JIT will now, occasionally, emit a warpsync instruction
>>>> immediately before a bar.sync for Volta GPUs. That implies that warps
>>>> must be convergent on entry to those thread barriers.
>>>>
>>>
>>> That warps must be convergent on entry to bar.sync is already required
>>> by ptx 3.1.
>>>
>>> [ And bar.warp.sync does not force convergence, so if the warpsync
>>> instruction you mention is equivalent to bar.warp.sync then your
>>> reasoning is incorrect. ]
>>
>> I'm under the impression that bar.warp.sync converges all of the
>> non-exited threads in a warp.
> 
> I have not played around with the instruction yet, so I'm not sure, but
> what I read from the docs is that bar.warp.sync converges all of the
> non-exited threads in a warp if and only if it's positioned at a point
> post-dominating a divergent branch.
> 
> Consider this case:
> ...
> if (tid.x == 0)
>   {
>     A;
>     bar.warp.sync 32;
>     B;
>   }
> else
>   {
>     C;
>     bar.warp.sync 32;
>     D;
>   }
> ...
> AFAIU, this allows bar.warp.sync to synchronize the threads in the warp,
> _without_ converging.

I think that's partially wrong. Check out the literature for CUDA 9
cooperative groups, such as
<https://devblogs.nvidia.com/cooperative-groups/>, to get an idea of the
intent behind bar.warp.sync.
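
For example (a sketch along the lines of that post, the kernel is made
up), the threads that took one side of a divergent branch can form a
coalesced group and synchronize just that group, which AFAIU is what
lowers to bar.warp.sync on sm_70:

  #include <cooperative_groups.h>
  namespace cg = cooperative_groups;

  __global__ void
  k (int *a)
  {
    if (a[threadIdx.x] > 0)
      {
        /* All threads that took this branch together.  */
        cg::coalesced_group g = cg::coalesced_threads ();
        a[threadIdx.x] = g.size ();
        g.sync ();      /* syncs only the divergent subset of the warp */
      }
  }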

>> You'd still need to use bar.sync or some
>> variant of the new barrier instruction to converge the entire CTA. But
>> at the moment, we're still generating code that's backwards compatible
>> with sm_30.
>>
>>>> The problem in og7, and trunk, is that GCC emits barrier
>>>> instructions at
>>>> the wrong spots. E.g., consider the following OpenACC parallel region:
>>>>
>>>>     #pragma acc parallel loop worker
>>>>     for (i = 0; i < 10; i++)
>>>>       a[i] = i;
>>>>
>>>> At -O2, GCC generates the following PTX code:
>>>>
>>>>           {
>>>>                   .reg.u32        %y;
>>>>                   mov.u32 %y, %tid.y;
>>>>                   setp.ne.u32     %r76, %y, 0;
>>>>           }
>>>>           {
>>>>                   .reg.u32        %x;
>>>>                   mov.u32 %x, %tid.x;
>>>>                   setp.ne.u32     %r75, %x, 0;
>>>>           }
>>>>           @%r76   bra.uni $L6;
>>>>           @%r75   bra     $L7;
>>>>                   mov.u64 %r67, %ar0;
>>>>           // fork 2;
>>>>                   cvta.shared.u64 %r74, __oacc_bcast;
>>>>                   st.u64  [%r74], %r67;
>>>> $L7:
>>>> $L6:
>>>>           @%r75   bra     $L5;
>>>>           // forked 2;
>>>>                   bar.sync        0;
>>>>                   cvta.shared.u64 %r73, __oacc_bcast;
>>>>                   ld.u64  %r67, [%r73];
>>>>                   mov.u32 %r62, %ntid.y;
>>>>                   mov.u32 %r63, %tid.y;
>>>>                   setp.gt.s32     %r68, %r63, 9;
>>>>           @%r68   bra     $L2;
>>>>                   mov.u32 %r55, %r63;
>>>>                   cvt.s64.s32     %r69, %r62;
>>>>                   shl.b64 %r59, %r69, 2;
>>>>                   cvt.s64.s32     %r70, %r55;
>>>>                   shl.b64 %r71, %r70, 2;
>>>>                   add.u64 %r58, %r67, %r71;
>>>> $L3:
>>>>                   st.u32  [%r58], %r55;
>>>>                   add.u32 %r55, %r55, %r62;
>>>>                   add.u64 %r58, %r58, %r59;
>>>>                   setp.le.s32     %r72, %r55, 9;
>>>>           @%r72   bra     $L3;
>>>> $L2:
>>>>                   bar.sync        1;
>>>>           // joining 2;
>>>> $L5:
>>>>           // join 2;
>>>>           ret;
>>>>
>>>> Note the bar.sync instructions placed immediately after the forked
>>>> comment and before the joining comment. The problem here is that the
>>>> branch above the forked comment guarantees that the warps are not
>>>> synchronous (when vector_length > 1, which is always the case).
>>>
>>> This is already advised against in ptx 3.1, so yes, we should fix this.
>>>
>>>> Likewise, the bar.sync
>>>> instruction before the joining comment should be placed after label $L5
>>>> in order to allow all of the threads in the warp to reach it.
>>>>
>>>
>>> Agreed.
>>>
>>>> The attached patch teaches the nvptx backend to make those adjustments.
>>>
>>> Can you show me a diff of the ptx for the test-case above for trunk?
>>
>> --- w-old.s     2018-03-08 15:19:47.139516578 -0800
>> +++ w.s 2018-03-09 08:42:52.217057332 -0800
>> @@ -46,9 +46,9 @@
>>                  st.u64  [%r74], %r67;
>>   $L7:
>>   $L6:
>> -       @%r75   bra     $L5;
>>          // forked 2;
>>                  bar.sync        0;
>> +       @%r75   bra     $L5;
>>                  cvta.shared.u64 %r73, __oacc_bcast;
>>                  ld.u64  %r67, [%r73];
>>                  mov.u32 %r62, %ntid.y;
>> @@ -68,9 +68,9 @@
>>                  setp.le.s32     %r72, %r55, 9;
>>          @%r72   bra     $L3;
>>   $L2:
>> -               bar.sync        1;
>>          // joining 2;
>>   $L5:
>> +               bar.sync        1;
>>          // join 2;
>>          ret;
>>   }
>>
>>
> 
> At -O0, yes.
> 
> At -O2, we have:
> ...
>  diff -u -a 1 2
> --- 1   2018-03-19 14:13:44.074834552 +0100
> +++ 2   2018-03-19 14:15:06.075301168 +0100
> @@ -42,20 +42,20 @@
>  st.u64 [%r32],%r25;
>  $L7:
>  $L6:
> -@ %r33 bra $L5;
>  // forked 2;
>  bar.sync 0;
> +@ %r33 bra $L5;
>  cvta.shared.u64 %r31,__worker_bcast;
>  ld.u64 %r25,[%r31];
>  mov.u32 %r24,%tid.y;
>  setp.le.s32 %r26,%r24,9;
>  @ %r26 bra $L2;
>  $L3:
> -bar.sync 1;
>  // joining 2;
>  $L5:
> -@ %r34 bra.uni $L8;
> +bar.sync 1;
>  @ %r33 bra $L9;
> +@ %r34 bra.uni $L8;
>  // join 2;
>  $L9:
>  $L8:
> ...
> 
> Note that this changes the ordering of the vector-neutering jump and
> the worker-neutering jump at the end. In principle, this should not be
> harmful, but it violates the invariant that vector-neutering
> branch-around code should be as short-lived as possible. So, this needs
> to be fixed.
> 
> I've found this issue by adding verification of the neutering, as
> attached below.

ACK, thanks. I'll take a closer look at this.

Is your patch purely for debugging, or are you planning on committing it
to og7 and trunk?

Cesar

* Re: [og7] Update nvptx_fork/join barrier placement
  2018-03-19 15:24       ` Cesar Philippidis
@ 2018-03-19 15:44         ` Tom de Vries
  2018-03-19 17:04         ` Tom de Vries
  1 sibling, 0 replies; 9+ messages in thread
From: Tom de Vries @ 2018-03-19 15:44 UTC (permalink / raw)
  To: Cesar Philippidis; +Cc: gcc-patches, Thomas Schwinge

On 03/19/2018 03:55 PM, Cesar Philippidis wrote:
> Is your patch purely for debugging, or are you planning on committing it
> to og7 and trunk?

I plan to commit it.

We have no test-cases testing the neutering code order explicitly. So 
this check is the only thing that allows us to detect regressions, other 
than execution failures on newer archs.

Thanks,
- Tom

* Re: [og7] Update nvptx_fork/join barrier placement
  2018-03-19 15:24       ` Cesar Philippidis
  2018-03-19 15:44         ` Tom de Vries
@ 2018-03-19 17:04         ` Tom de Vries
  2018-03-19 18:09           ` Cesar Philippidis
  2018-03-20 10:41           ` [nvptx, PR84952, committed] Fix bar.sync position Tom de Vries
  1 sibling, 2 replies; 9+ messages in thread
From: Tom de Vries @ 2018-03-19 17:04 UTC (permalink / raw)
  To: Cesar Philippidis; +Cc: gcc-patches, Thomas Schwinge

On 03/19/2018 03:55 PM, Cesar Philippidis wrote:
>> Note that this changes the ordering of the vector-neutering jump and
>> the worker-neutering jump at the end. In principle, this should not be
>> harmful, but it violates the invariant that vector-neutering
>> branch-around code should be as short-lived as possible. So, this needs
>> to be fixed.
>>
>> I've found this issue by adding verification of the neutering, as
>> attached below.
> ACK, thanks. I'll take a closer look at this.

I've got a tentative patch at 
https://gcc.gnu.org/bugzilla/attachment.cgi?id=43707 ( PR84952 - 
"[nvptx] bar.sync generated in divergent code" ).

Thanks,
- Tom

* Re: [og7] Update nvptx_fork/join barrier placement
  2018-03-19 17:04         ` Tom de Vries
@ 2018-03-19 18:09           ` Cesar Philippidis
  2018-03-20 10:41           ` [nvptx, PR84952, committed] Fix bar.sync position Tom de Vries
  1 sibling, 0 replies; 9+ messages in thread
From: Cesar Philippidis @ 2018-03-19 18:09 UTC (permalink / raw)
  To: Tom de Vries; +Cc: gcc-patches, Thomas Schwinge

[-- Attachment #1: Type: text/plain, Size: 862 bytes --]

On 03/19/2018 10:02 AM, Tom de Vries wrote:
> On 03/19/2018 03:55 PM, Cesar Philippidis wrote:
>>> Note that this changes the ordering of the vector-neutering jump and
>>> the worker-neutering jump at the end. In principle, this should not be
>>> harmful, but it violates the invariant that vector-neutering
>>> branch-around code should be as short-lived as possible. So, this needs
>>> to be fixed.
>>>
>>> I've found this issue by adding verification of the neutering, as
>>> attached below.
>> ACK, thanks. I'll take a closer look at this.
> 
> I've got a tentative patch at
> https://gcc.gnu.org/bugzilla/attachment.cgi?id=43707 ( PR84952 -
> "[nvptx] bar.sync generated in divergent code" ).

I attached my WIP patch. But, given that you've spent a lot of time on
this, I'll let you continue working on it. Just remember to backport any
fix to og7.

Thanks,
Cesar

[-- Attachment #2: trunk-barriers-new.diff --]
[-- Type: text/x-patch, Size: 2341 bytes --]

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index a6f444340fd..0d288cb81ba 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4037,6 +4037,22 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	return;
     }
 
+  /* NVPTX_BARSYNC barriers are placed immediately before NVPTX_JOIN
+     in order to ensure that all of the threads in a CTA reach the
+     barrier.  Don't neuter BLOCK if head is NVPTX_BARSYNC and tail is
+     NVPTX_JOIN.  */
+  if (from == to
+      && recog_memoized (head) == CODE_FOR_nvptx_barsync
+      && recog_memoized (tail) == CODE_FOR_nvptx_join)
+    return;
+
+  /* Adjust HEAD to point to the NVPTX_JOIN instruction after a
+     NVPTX_BARSYNC, so that any successive state neutering code does
+     not get placed before the dummy JOIN comment. */
+  if (recog_memoized (head) == CODE_FOR_nvptx_barsync
+      && recog_memoized (NEXT_INSN (head)) == CODE_FOR_nvptx_join)
+    head = NEXT_INSN (head);
+
   /* Insert the vector test inside the worker test.  */
   unsigned mode;
   rtx_insn *before = tail;
@@ -4057,7 +4073,23 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	  br = gen_br_true (pred, label);
 	else
 	  br = gen_br_true_uni (pred, label);
-	emit_insn_before (br, head);
+
+	if (recog_memoized (head) == CODE_FOR_nvptx_forked
+	    && recog_memoized (NEXT_INSN (head)) == CODE_FOR_nvptx_barsync)
+	  {
+	    head = NEXT_INSN (head);
+	    emit_insn_after (br, head);
+	  }
+	else if (recog_memoized (head) == CODE_FOR_nvptx_join)
+	  {
+	    if (recog_memoized (NEXT_INSN (head)) == CODE_FOR_br_true_uni
+		&& mode == GOMP_DIM_VECTOR)
+	      emit_insn_after (br, NEXT_INSN (head));
+	    else
+	      emit_insn_after (br, head);
+	  }
+	else
+	  emit_insn_before (br, head);
 
 	LABEL_NUSES (label)++;
 	if (tail_branch)
@@ -4276,7 +4308,7 @@ nvptx_process_pars (parallel *par)
       nvptx_wpropagate (true, par->forked_block, par->fork_insn);
       /* Insert begin and end synchronizations.  */
       emit_insn_after (nvptx_wsync (false), par->forked_insn);
-      emit_insn_before (nvptx_wsync (true), par->joining_insn);
+      emit_insn_before (nvptx_wsync (true), par->join_insn);
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
     nvptx_vpropagate (par->forked_block, par->forked_insn);

* [nvptx, PR84952, committed] Fix bar.sync position
  2018-03-19 17:04         ` Tom de Vries
  2018-03-19 18:09           ` Cesar Philippidis
@ 2018-03-20 10:41           ` Tom de Vries
  1 sibling, 0 replies; 9+ messages in thread
From: Tom de Vries @ 2018-03-20 10:41 UTC (permalink / raw)
  To: gcc-patches; +Cc: Cesar Philippidis, Thomas Schwinge, Richard Biener

[-- Attachment #1: Type: text/plain, Size: 3833 bytes --]

[ was: Re: [og7] Update nvptx_fork/join barrier placement ]

On 03/19/2018 06:02 PM, Tom de Vries wrote:
> I've got a tentative patch at 
> https://gcc.gnu.org/bugzilla/attachment.cgi?id=43707 ( PR84952 - 
> "[nvptx] bar.sync generated in divergent code" ).

Tested on x86_64 with nvptx accelerator (in combination with a patch 
that verifies the positioning of bar.sync).

Committed to stage4 trunk.

[ Recap:

Consider testcase workers.c:
...
int
main (void)
{
   int a[10];
#pragma acc parallel loop worker
   for (int i = 0; i < 10; i++)
     a[i] = i;

   return 0;
}
...

At -O2, we generate (edited for readability):
...
// BEGIN PREAMBLE
.version 3.1
.target sm_30
.address_size 64
// END PREAMBLE

// BEGIN FUNCTION DECL: main$_omp_fn$0
.entry main$_omp_fn$0 (.param .u64 %in_ar0);

//:FUNC_MAP "main$_omp_fn$0", 0x1, 0x20, 0x20

// BEGIN VAR DEF: __worker_bcast
.shared .align 8 .u8 __worker_bcast[8];

// BEGIN FUNCTION DEF: main$_omp_fn$0
.entry main$_omp_fn$0 (.param .u64 %in_ar0)
{
   .reg .u64 %ar0;
   ld.param.u64 %ar0,[%in_ar0];
   .reg .u32 %r24;
   .reg .u64 %r25;
   .reg .pred %r26;
   .reg .u64 %r27;
   .reg .u64 %r28;
   .reg .u64 %r29;
   .reg .u64 %r30;
   .reg .u64 %r31;
   .reg .u64 %r32;
   .reg .pred %r33;
   .reg .pred %r34;

   {
     .reg .u32 %y;
     mov.u32 %y,%tid.y;
     setp.ne.u32 %r34,%y,0;
   }

   {
     .reg .u32 %x;
     mov.u32 %x,%tid.x;
     setp.ne.u32 %r33,%x,0;
   }

   @ %r34 bra.uni $L6;
   @ %r33 bra $L7;
   mov.u64 %r25,%ar0;
   // fork 2;
   cvta.shared.u64 %r32,__worker_bcast;
   st.u64 [%r32],%r25;
  $L7:
  $L6:

   @ %r33 bra $L5;
   // forked 2;
   bar.sync 0;
   cvta.shared.u64 %r31,__worker_bcast;
   ld.u64 %r25,[%r31];
   mov.u32 %r24,%tid.y;
   setp.le.s32 %r26,%r24,9;
   @ %r26 bra $L2;
   bra $L3;
  $L2:
   ld.u64 %r27,[%r25];
   cvt.s64.s32 %r28,%r24;
   shl.b64 %r29,%r28,2;
   add.u64 %r30,%r27,%r29;
   st.u32 [%r30],%r24;
  $L3:
   bar.sync 1;
   // joining 2;
  $L5:

   @ %r34 bra.uni $L8;
   @ %r33 bra $L9;
   // join 2;
  $L9:
  $L8:

   ret;
}
...

The problem is the positioning of bar.sync, inside the vector-neutering 
branch "@ %r33 bra $L5".

The documentation for bar.sync says:
...
Barriers are executed on a per-warp basis as if all the threads in a 
warp are active. Thus, if any thread in a warp executes a bar 
instruction, it is as if all the threads in the warp have executed the 
bar instruction. All threads in the warp are stalled until the barrier 
completes, and the arrival count for the barrier is incremented by the 
warp size (not the number of active threads in the warp). In 
conditionally executed code, a bar instruction should only be used if it 
is known that all threads evaluate the condition identically (the warp 
does not diverge).
...

The documentation is somewhat contradictory, in that it first explains
that it is executed on a per-warp basis (implying that only one thread
executing it should be fine), but then goes on to state that it should
not be executed in divergent mode (implying that all threads should
execute it).

Either way, the safest form of usage is: don't execute in divergent mode.

As is evident from the example above, we do generate bar.sync in
divergent mode, and the patch below fixes that.

With the patch, the difference in positioning of bar.sync in the
example above is:
...
@@ -42,18 +42,18 @@
    st.u64       [%r32], %r25;
   $L7:
   $L6:
+  bar.sync     0;
    @%r33        bra     $L5;
    // forked 2;
-  bar.sync     0;
    cvta.shared.u64      %r31, __worker_bcast;
    ld.u64       %r25, [%r31];
    mov.u32      %r24, %tid.y;
    setp.le.s32  %r26, %r24, 9;
    @%r26        bra     $L2;
   $L3:
-  bar.sync     1;
    // joining 2;
   $L5:
+  bar.sync     1;
    @%r34        bra.uni $L8;
    @%r33        bra     $L9;
    // join 2;
...
]

Thanks,
- Tom

[-- Attachment #2: 0001-nvptx-Fix-bar.sync-position.patch --]
[-- Type: text/x-patch, Size: 1703 bytes --]

[nvptx] Fix bar.sync position

2018-03-20  Tom de Vries  <tom@codesourcery.com>

	PR target/84952
	* config/nvptx/nvptx.c (nvptx_single): Don't neuter bar.sync.
	(nvptx_process_pars): Emit bar.sync asap and alap.

---
 gcc/config/nvptx/nvptx.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index a6f4443..a839988 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -3969,7 +3969,9 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
   while (true)
     {
       /* Find first insn of from block.  */
-      while (head != BB_END (from) && !INSN_P (head))
+      while (head != BB_END (from)
+	     && (!INSN_P (head)
+		 || recog_memoized (head) == CODE_FOR_nvptx_barsync))
 	head = NEXT_INSN (head);
 
       if (from == to)
@@ -4018,6 +4020,7 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
 	{
 	default:
 	  break;
+	case CODE_FOR_nvptx_barsync:
 	case CODE_FOR_nvptx_fork:
 	case CODE_FOR_nvptx_forked:
 	case CODE_FOR_nvptx_joining:
@@ -4275,8 +4278,8 @@ nvptx_process_pars (parallel *par)
       nvptx_wpropagate (false, par->forked_block, par->forked_insn);
       nvptx_wpropagate (true, par->forked_block, par->fork_insn);
       /* Insert begin and end synchronizations.  */
-      emit_insn_after (nvptx_wsync (false), par->forked_insn);
-      emit_insn_before (nvptx_wsync (true), par->joining_insn);
+      emit_insn_before (nvptx_wsync (false), par->forked_insn);
+      emit_insn_before (nvptx_wsync (true), par->join_insn);
     }
   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
     nvptx_vpropagate (par->forked_block, par->forked_insn);
