public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, 2/2][nvptx, PR83589] Workaround for branch-around-nothing JIT bug
@ 2018-01-24 11:00 Tom de Vries
  2018-01-24 11:03 ` Richard Biener
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Tom de Vries @ 2018-01-24 11:00 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: GCC Patches, Richard Biener

[-- Attachment #1: Type: text/plain, Size: 1880 bytes --]

Hi,

this patch adds a workaround for the nvptx target JIT bug PR83589 - 
"[nvptx] mode-transitions.c and private-variables.{c,f90} execution 
FAILs at GOMP_NVPTX_JIT=-O0".


When compiling a branch-around-nothing (where the branch is warp 
neutering, so it's a divergent branch):
...
   .reg .pred %r36;
   {
     .reg .u32 %x;
     mov.u32 %x,%tid.x;
     setp.ne.u32 %r36,%x,0;
   }

   @ %r36 bra $L5;
   $L5:
...

The JIT fails to generate a convergence point here:
...
          /*0128*/               @P0 BRA `(.L_1);
.L_1:
...

Consequently, we execute subsequent code in divergent mode, and when 
executing a shfl.idx a bit later we run into the undefined behaviour 
that shfl.idx has when executing in divergent mode.

The workaround detects branch-around-nothing, and inserts a ptx 
operation that does nothing (I'm calling it a fake nop, I haven't been 
able to come up with a better term yet):
...
   @ %r36 bra $L5;
     {
       .reg .u32 %nop_src;
       .reg .u32 %nop_dst;
       mov.u32 %nop_dst, %nop_src;
     }
   $L5:
...
which makes the test pass, because then we generate a convergence point 
here at .L1:
...
         /*0128*/                   SSY `(.L_1);
         /*0130*/               @P0 SYNC (*"TARGET= .L_1 "*);
         /*0138*/                   SYNC (*"TARGET= .L_1 "*);
.L_1:
...

The workaround is not minimal given that it inserts the fake nop in all 
branch-around-nothings it detects, not just the warp neutering ones, but 
I think this is more robust than trying to identify the warp neutering 
branches. Furthermore, I'm not going for optimality here anyway. The 
optimal way to fix this is making sure we don't generate 
branch-around-nothing, but that's for stage1.

Build and reg-tested on x86_64 with nvptx accelerator.

I'd like to commit in stage4, but I'd appreciate a review of the code. 
Does the patch look OK?

Thanks,
- Tom

[-- Attachment #2: 0002-nvptx-PR83589-Workaround-for-branch-around-nothing-JIT-bug.patch --]
[-- Type: text/x-patch, Size: 4980 bytes --]

[nvptx, PR83589] Workaround for branch-around-nothing JIT bug

2018-01-23  Tom de Vries  <tom@codesourcery.com>

	PR target/83589
	* config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG_2): Define to 1.
	(nvptx_pc_set, nvptx_condjump_label): New function. Copy from jump.c.
	Add strict parameter.
	(prevent_branch_around_nothing): Insert dummy insn between branch to
	label and label with no ptx insn inbetween.
	* config/nvptx/nvptx.md (define_insn "fake_nop"): New insn.

	* testsuite/libgomp.oacc-c-c++-common/pr83589.c: New test.

---
 gcc/config/nvptx/nvptx.c                           | 92 ++++++++++++++++++++++
 gcc/config/nvptx/nvptx.md                          |  9 +++
 .../testsuite/libgomp.oacc-c-c++-common/pr83589.c  | 21 +++++
 3 files changed, 122 insertions(+)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 3516740..e55b426 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -78,6 +78,7 @@
 #include "target-def.h"
 
 #define WORKAROUND_PTXJIT_BUG 1
+#define WORKAROUND_PTXJIT_BUG_2 1
 
 /* The various PTX memory areas an object might reside in.  */
 enum nvptx_data_area
@@ -4363,6 +4364,93 @@ nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
     nvptx_neuter_pars (par->next, modes, outer);
 }
 
+#if WORKAROUND_PTXJIT_BUG_2
+/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
+   is needed in the nvptx target because the branches generated for
+   parititioning are NONJUMP_INSN_P, not JUMP_P.  */
+
+static rtx
+nvptx_pc_set (const rtx_insn *insn, bool strict = true)
+{
+  rtx pat;
+  if ((strict && !JUMP_P (insn))
+      || (!strict && !INSN_P (insn)))
+    return NULL_RTX;
+  pat = PATTERN (insn);
+
+  /* The set is allowed to appear either as the insn pattern or
+     the first set in a PARALLEL.  */
+  if (GET_CODE (pat) == PARALLEL)
+    pat = XVECEXP (pat, 0, 0);
+  if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
+    return pat;
+
+  return NULL_RTX;
+}
+
+/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT.  */
+
+static rtx
+nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
+{
+  rtx x = nvptx_pc_set (insn, strict);
+
+  if (!x)
+    return NULL_RTX;
+  x = SET_SRC (x);
+  if (GET_CODE (x) == LABEL_REF)
+    return x;
+  if (GET_CODE (x) != IF_THEN_ELSE)
+    return NULL_RTX;
+  if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
+    return XEXP (x, 1);
+  if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
+    return XEXP (x, 2);
+  return NULL_RTX;
+}
+
+/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
+   insn inbetween the branch and the label.  This works around a JIT bug
+   observed at driver version 384.111, at -O0 for sm_50.  */
+
+static void
+prevent_branch_around_nothing (void)
+{
+  rtx_insn *seen_label = 0;
+    for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
+      {
+	if (seen_label == 0)
+	  {
+	    if (INSN_P (insn) && condjump_p (insn))
+	      seen_label = label_ref_label (nvptx_condjump_label (insn, false));
+
+	    continue;
+	  }
+
+	if (NOTE_P (insn))
+	  continue;
+
+	if (INSN_P (insn))
+	  switch (recog_memoized (insn))
+	    {
+	    case CODE_FOR_nvptx_fork:
+	    case CODE_FOR_nvptx_forked:
+	    case CODE_FOR_nvptx_joining:
+	    case CODE_FOR_nvptx_join:
+	      continue;
+	    default:
+	      seen_label = 0;
+	      continue;
+	    }
+
+	if (LABEL_P (insn) && insn == seen_label)
+	  emit_insn_before (gen_fake_nop (), insn);
+
+	seen_label = 0;
+      }
+  }
+#endif
+
 /* PTX-specific reorganization
    - Split blocks at fork and join instructions
    - Compute live registers
@@ -4442,6 +4530,10 @@ nvptx_reorg (void)
   if (TARGET_UNIFORM_SIMT)
     nvptx_reorg_uniform_simt ();
 
+#if WORKAROUND_PTXJIT_BUG_2
+  prevent_branch_around_nothing ();
+#endif
+
   regstat_free_n_sets_and_refs ();
 
   df_finish_pass (true);
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 135479b..4f4453d 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -999,6 +999,15 @@
   ""
   "exit;")
 
+(define_insn "fake_nop"
+  [(const_int 2)]
+  ""
+  "{
+     .reg .u32 %%nop_src;
+     .reg .u32 %%nop_dst;
+     mov.u32 %%nop_dst, %%nop_src;
+   }")
+
 (define_insn "return"
   [(return)]
   ""
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83589.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83589.c
new file mode 100644
index 0000000..a6ed5cf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83589.c
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-set-target-env-var GOMP_NVPTX_JIT "-O0" } */
+
+#define n 32
+
+int
+main (void)
+{
+  int arr_a[n];
+
+#pragma acc parallel copyout(arr_a) num_gangs(1) num_workers(1) vector_length(32)
+  {
+    #pragma acc loop vector
+    for (int m = 0; m < 32; m++)
+      ;
+
+    #pragma acc loop vector
+    for (int m = 0; m < 32; m++)
+      arr_a[m] = 0;
+  }
+}

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-05-17  6:35 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-24 11:00 [PATCH, 2/2][nvptx, PR83589] Workaround for branch-around-nothing JIT bug Tom de Vries
2018-01-24 11:03 ` Richard Biener
2018-01-24 11:10 ` Jakub Jelinek
2018-01-24 14:04   ` Tom de Vries
2018-01-24 14:19     ` Jakub Jelinek
2018-01-24 16:03       ` Tom de Vries
2018-05-17  7:25     ` Thomas Schwinge
2018-03-20  9:44 ` [nvptx, PR84954, committed] Fix prevent_branch_around_nothing Tom de Vries

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).