[PATCH 1/8] nvptx -msoft-stack

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH 1/8] nvptx -msoft-stack
  2016-06-09 16:54 [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Alexander Monakov
  2016-06-09 16:54 ` [PATCH 5/8] nvptx mkoffload: pass -mgomp for OpenMP offloading Alexander Monakov
  2016-06-09 16:54 ` [PATCH 7/8] nvptx: new insns for OpenMP SIMD-on-SIMT Alexander Monakov
@ 2016-06-09 16:54 ` Alexander Monakov
  2016-06-09 16:54 ` [PATCH 6/8] new target hook: TARGET_SIMT_VF Alexander Monakov
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 16+ messages in thread
From: Alexander Monakov @ 2016-06-09 16:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: Nathan Sidwell

This is a respin of the recently reviewed -msoft-stack patch that addresses
review feedback and adds required libgcc changes.

gcc/:
	* config/nvptx/nvptx-protos.h (nvptx_output_set_softstack): Declare.
	* config/nvptx/nvptx.c: (need_softstack_decl): New variable.
	(init_softstack_frame): New.
	(nvptx_declare_function_name): Handle TARGET_SOFT_STACK.
	(nvptx_output_set_softstack): New.
	(nvptx_output_return): Emit stack restore if needed.
	(nvptx_get_drap_rtx): Return %argp as the DRAP if needed.
	(nvptx_file_end): Handle need_softstack_decl.
	* config/nvptx/nvptx.h: (TARGET_CPU_CPP_BUILTINS): Define
	__nvptx_softstack__ when -msoft-stack is active.
	(STACK_SIZE_MODE): Define.
	(FIXED_REGISTERS): Adjust.
	(SOFTSTACK_SLOT_REGNUM): New.
	(SOFTSTACK_PREV_REGNUM): New.
	(REGISTER_NAMES): Adjust.
	(struct machine_function): New bool field has_softstack.
	* config/nvptx/nvptx.md (UNSPEC_SET_SOFTSTACK): New.
	(allocate_stack): Implement for TARGET_SOFT_STACK.  Remove unused code.
	(allocate_stack_<mode>): Remove unused pattern.
	(set_softstack_insn): New pattern.
	(restore_stack_block): Handle for TARGET_SOFT_STACK.
	* config/nvptx/nvptx.opt: (msoft-stack): New option.
	* doc/invoke.texi (msoft-stack): Document.

gcc/testsuite/:
	* gcc.target/nvptx/softstack.c: New test.
	* lib/target-supports.exp (check_effective_target_alloca): Use a
	compile test.

libgcc/:
	* config/nvptx/crt0.c (__main) [__nvptx_softstack__]: Setup
	__nvptx_stacks.
	* config/nvptx/stacks.c: New file.
	* config/nvptx/t-nvptx (LIB2ADD): Add stacks.c.
---
 gcc/config/nvptx/nvptx-protos.h            |   1 +
 gcc/config/nvptx/nvptx.c                   | 111 ++++++++++++++++++++++++++---
 gcc/config/nvptx/nvptx.h                   |  15 +++-
 gcc/config/nvptx/nvptx.md                  |  33 ++++++---
 gcc/config/nvptx/nvptx.opt                 |   4 ++
 gcc/doc/invoke.texi                        |  12 ++++
 gcc/testsuite/gcc.target/nvptx/softstack.c |  23 ++++++
 gcc/testsuite/lib/target-supports.exp      |   5 +-
 libgcc/config/nvptx/crt0.c                 |   6 ++
 libgcc/config/nvptx/stacks.c               |  24 +++++++
 libgcc/config/nvptx/t-nvptx                |   3 +-
 11 files changed, 213 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/nvptx/softstack.c
 create mode 100644 libgcc/config/nvptx/stacks.c

diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h
index ec4588e..647607d 100644
--- a/gcc/config/nvptx/nvptx-protos.h
+++ b/gcc/config/nvptx/nvptx-protos.h
@@ -41,5 +41,6 @@ extern const char *nvptx_ptx_type_from_mode (machine_mode, bool);
 extern const char *nvptx_output_mov_insn (rtx, rtx);
 extern const char *nvptx_output_call_insn (rtx_insn *, rtx, rtx);
 extern const char *nvptx_output_return (void);
+extern const char *nvptx_output_set_softstack (unsigned);
 #endif
 #endif
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 39e4145..1c95fdf 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -139,6 +139,9 @@ static GTY(()) rtx worker_red_sym;
 /* Global lock variable, needed for 128bit worker & gang reductions.  */
 static GTY(()) tree global_lock_var;
 
+/* True if any function references __nvptx_stacks.  */
+static bool need_softstack_decl;
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -979,6 +982,60 @@ init_frame (FILE  *file, int regno, unsigned align, unsigned size)
 	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
 }
 
+/* Emit soft stack frame setup sequence.  */
+
+static void
+init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
+{
+  /* Maintain 64-bit stack alignment.  */
+  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+  size = ROUND_UP (size, keep_align);
+  int bits = POINTER_SIZE;
+  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
+  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
+  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
+  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
+  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
+  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
+  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
+  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
+  fprintf (file, "\t{\n");
+  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
+  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
+  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
+  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
+  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
+	   bits == 64 ? ".wide" : ".lo", bits / 8);
+  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
+  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
+  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
+  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
+  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
+	   bits, reg_sspprev, reg_sspslot);
+  /* Initialize %frame = %sspprev - size.  */
+  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
+	   bits, reg_frame, reg_sspprev, size);
+  /* Apply alignment, if larger than 64.  */
+  if (alignment > keep_align)
+    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
+	     bits, reg_frame, reg_frame, -alignment);
+  size = crtl->outgoing_args_size;
+  gcc_assert (size % keep_align == 0);
+  /* Initialize %stack.  */
+  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
+	   bits, reg_stack, reg_frame, size);
+  /* Usually 'crtl->is_leaf' is computed during register allocator
+     initialization, which is not done on NVPTX.  Compute it now.  */
+  gcc_assert (!crtl->is_leaf);
+  crtl->is_leaf = leaf_function_p ();
+  if (!crtl->is_leaf)
+    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
+	     bits, reg_sspslot, reg_stack);
+  fprintf (file, "\t}\n");
+  cfun->machine->has_softstack = true;
+  need_softstack_decl = true;
+}
+
 /* Emit code to initialize the REGNO predicate register to indicate
    whether we are not lane zero on the NAME axis.  */
 
@@ -1040,16 +1097,22 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
 
   fprintf (file, "%s", s.str().c_str());
 
-  /* Declare a local var for outgoing varargs.  */
-  if (cfun->machine->has_varadic)
-    init_frame (file, STACK_POINTER_REGNUM,
-		UNITS_PER_WORD, crtl->outgoing_args_size);
-
-  /* Declare a local variable for the frame.  */
   HOST_WIDE_INT sz = get_frame_size ();
-  if (sz || cfun->machine->has_chain)
-    init_frame (file, FRAME_POINTER_REGNUM,
-		crtl->stack_alignment_needed / BITS_PER_UNIT, sz);
+  bool need_frameptr = sz || cfun->machine->has_chain;
+  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
+  if (!TARGET_SOFT_STACK)
+    {
+      /* Declare a local var for outgoing varargs.  */
+      if (cfun->machine->has_varadic)
+	init_frame (file, STACK_POINTER_REGNUM,
+		    UNITS_PER_WORD, crtl->outgoing_args_size);
+
+      /* Declare a local variable for the frame.  */
+      if (need_frameptr)
+	init_frame (file, FRAME_POINTER_REGNUM, alignment, sz);
+    }
+  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca)
+    init_softstack_frame (file, alignment, sz);
 
   /* Declare the pseudos we have as ptx registers.  */
   int maxregs = max_reg_num ();
@@ -1077,6 +1140,21 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
 			       REGNO (cfun->machine->axis_predicate[1]), "x");
 }
 
+/* Output instruction that sets soft stack pointer in shared memory to the
+   value in register given by SRC_REGNO.  */
+
+const char *
+nvptx_output_set_softstack (unsigned src_regno)
+{
+  if (cfun->machine->has_softstack && !crtl->is_leaf)
+    {
+      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
+	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
+      output_reg (asm_out_file, src_regno, VOIDmode);
+      fprintf (asm_out_file, ";\n");
+    }
+  return "";
+}
 /* Output a return instruction.  Also copy the return value to its outgoing
    location.  */
 
@@ -1085,6 +1163,9 @@ nvptx_output_return (void)
 {
   machine_mode mode = (machine_mode)cfun->machine->return_mode;
 
+  if (TARGET_SOFT_STACK)
+    nvptx_output_set_softstack (SOFTSTACK_PREV_REGNUM);
+
   if (mode != VOIDmode)
     fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
 	     nvptx_ptx_type_from_mode (mode, false),
@@ -1116,6 +1197,8 @@ nvptx_function_ok_for_sibcall (tree, tree)
 static rtx
 nvptx_get_drap_rtx (void)
 {
+  if (TARGET_SOFT_STACK && stack_realign_drap)
+    return arg_pointer_rtx;
   return NULL_RTX;
 }
 
@@ -4023,6 +4106,16 @@ nvptx_file_end (void)
   if (worker_red_size)
     write_worker_buffer (asm_out_file, worker_red_sym,
 			 worker_red_align, worker_red_size);
+
+  if (need_softstack_decl)
+    {
+      write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
+      /* 32 is the maximum number of warps in a block.  Even though it's an
+         external declaration, emit the array size explicitly; otherwise, it
+	 may fail at PTX JIT time if the definition is later in link order.  */
+      fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
+	       POINTER_SIZE);
+    }
 }
 
 /* Expander for the shuffle builtins.  */
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 381269e..2c4962a 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -31,6 +31,8 @@
       builtin_assert ("machine=nvptx");		\
       builtin_assert ("cpu=nvptx");		\
       builtin_define ("__nvptx__");		\
+      if (TARGET_SOFT_STACK)			\
+        builtin_define ("__nvptx_softstack__");	\
     } while (0)
 
 /* Avoid the default in ../../gcc.c, which adds "-pthread", which is not
@@ -79,13 +81,14 @@
 
 #define POINTER_SIZE (TARGET_ABI64 ? 64 : 32)
 #define Pmode (TARGET_ABI64 ? DImode : SImode)
+#define STACK_SIZE_MODE Pmode
 
 /* Registers.  Since ptx is a virtual target, we just define a few
    hard registers for special purposes and leave pseudos unallocated.
    We have to have some available hard registers, to keep gcc setup
    happy.  */
 #define FIRST_PSEUDO_REGISTER 16
-#define FIXED_REGISTERS	    { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+#define FIXED_REGISTERS	    { 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
 #define CALL_USED_REGISTERS { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
 
 #define HARD_REGNO_NREGS(REG, MODE)		\
@@ -133,10 +136,17 @@ enum reg_class             {  NO_REGS,    ALL_REGS,	LIM_REG_CLASSES };
 #define FRAME_POINTER_REGNUM 2
 #define ARG_POINTER_REGNUM 3
 #define STATIC_CHAIN_REGNUM 4
+/* This register points to the shared memory location with the current warp's
+   soft stack pointer (__nvptx_stacks[tid.y]).  */
+#define SOFTSTACK_SLOT_REGNUM 5
+/* This register is used to save the previous value of the soft stack pointer
+   in the prologue and restore it when returning.  */
+#define SOFTSTACK_PREV_REGNUM 6
 
 #define REGISTER_NAMES							\
   {									\
-    "%value", "%stack", "%frame", "%args", "%chain", "%hr5", "%hr6", "%hr7", \
+    "%value", "%stack", "%frame", "%args",                              \
+    "%chain", "%sspslot", "%sspprev", "%hr7",                           \
     "%hr8", "%hr9", "%hr10", "%hr11", "%hr12", "%hr13", "%hr14", "%hr15" \
   }
 
@@ -200,6 +210,7 @@ struct GTY(()) machine_function
   bool is_varadic;  /* This call is varadic  */
   bool has_varadic;  /* Current function has a varadic call.  */
   bool has_chain; /* Current function has outgoing static chain.  */
+  bool has_softstack; /* Current function has a soft stack frame.  */
   int num_args;	/* Number of args of current call.  */
   int return_mode; /* Return mode of current fn.
 		      (machine_mode not defined yet.) */
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index e48412d..bbc8b1c 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -36,6 +36,8 @@
 
    UNSPEC_ALLOCA
 
+   UNSPEC_SET_SOFTSTACK
+
    UNSPEC_DIM_SIZE
 
    UNSPEC_BIT_CONV
@@ -972,31 +974,40 @@
    (match_operand 1 "nvptx_register_operand")]
   ""
 {
+  if (TARGET_SOFT_STACK)
+    {
+      emit_move_insn (stack_pointer_rtx,
+		      gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1]));
+      emit_insn (gen_set_softstack_insn (stack_pointer_rtx));
+      emit_move_insn (operands[0], virtual_stack_dynamic_rtx);
+      DONE;
+    }
   /* The ptx documentation specifies an alloca intrinsic (for 32 bit
      only)  but notes it is not implemented.  The assembler emits a
      confused error message.  Issue a blunt one now instead.  */
   sorry ("target cannot support alloca.");
   emit_insn (gen_nop ());
   DONE;
-  if (TARGET_ABI64)
-    emit_insn (gen_allocate_stack_di (operands[0], operands[1]));
-  else
-    emit_insn (gen_allocate_stack_si (operands[0], operands[1]));
-  DONE;
 })
 
-(define_insn "allocate_stack_<mode>"
-  [(set (match_operand:P 0 "nvptx_register_operand" "=R")
-        (unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")]
-                   UNSPEC_ALLOCA))]
-  ""
-  "%.\\tcall (%0), %%alloca, (%1);")
+(define_insn "set_softstack_insn"
+  [(unspec [(match_operand 0 "nvptx_register_operand" "R")]
+	   UNSPEC_SET_SOFTSTACK)]
+  "TARGET_SOFT_STACK"
+{
+  return nvptx_output_set_softstack (REGNO (operands[0]));
+})
 
 (define_expand "restore_stack_block"
   [(match_operand 0 "register_operand" "")
    (match_operand 1 "register_operand" "")]
   ""
 {
+  if (TARGET_SOFT_STACK)
+    {
+      emit_move_insn (operands[0], operands[1]);
+      emit_insn (gen_set_softstack_insn (operands[0]));
+    }
   DONE;
 })
 
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index 601cf12..71ddadb 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -32,3 +32,7 @@ Link in code for a __main kernel.
 moptimize
 Target Report Var(nvptx_optimize) Init(-1)
 Optimize partition neutering.
+
+msoft-stack
+Target Report Mask(SOFT_STACK)
+Use custom stacks instead of local memory for automatic storage.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e3a2399..5ed5789 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -19609,6 +19609,18 @@ offloading execution.
 Apply partitioned execution optimizations.  This is the default when any
 level of optimization is selected.
 
+@item -msoft-stack
+@opindex msoft-stack
+Generate code that does not use @code{.local} memory
+directly for stack storage. Instead, a per-warp stack pointer is
+maintained explicitly. This enables variable-length stack allocation (with
+variable-length arrays or @code{alloca}), and when global memory is used for
+underlying storage, makes it possible to access automatic variables from other
+threads, or with atomic instructions. This code generation variant is used
+for OpenMP offloading, but the option is exposed on its own for the purpose
+of testing the compiler; to generate code suitable for linking into programs
+using OpenMP offloading, use option @option{-mgomp}.
+
 @end table
 
 @node PDP-11 Options
diff --git a/gcc/testsuite/gcc.target/nvptx/softstack.c b/gcc/testsuite/gcc.target/nvptx/softstack.c
new file mode 100644
index 0000000..73e60f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/softstack.c
@@ -0,0 +1,23 @@
+/* { dg-options "-O2 -msoft-stack" } */
+/* { dg-do run } */
+
+static __attribute__((noinline,noclone)) int f(int *p)
+{
+  return __sync_lock_test_and_set(p, 1);
+}
+
+static __attribute__((noinline,noclone)) int g(int n)
+{
+  /* Check that variable-length stack allocation works.  */
+  int v[n];
+  v[0] = 0;
+  /* Check that atomic operations can be applied to auto data.  */
+  return f(v) == 0 && v[0] == 1;
+}
+
+int main()
+{
+  if (!g(1))
+    __builtin_abort();
+  return 0;
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index f4cb276..c10b6e6 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -729,7 +729,10 @@ proc check_effective_target_untyped_assembly {} {
 
 proc check_effective_target_alloca {} {
     if { [istarget nvptx-*-*] } {
-	return 0
+	return [check_no_compiler_messages alloca assembly {
+	    void f (void*);
+	    void g (int n) { f (__builtin_alloca (n)); }
+	}]
     }
     return 1
 }
diff --git a/libgcc/config/nvptx/crt0.c b/libgcc/config/nvptx/crt0.c
index 3b7382d..e31696c 100644
--- a/libgcc/config/nvptx/crt0.c
+++ b/libgcc/config/nvptx/crt0.c
@@ -33,5 +33,11 @@ __main (int *rval_ptr, int argc, void **argv)
   if (rval_ptr)
     *rval_ptr = 255;
 
+#ifdef __nvptx_softstack__
+  extern char *__nvptx_stacks[32] __attribute__((shared));
+  static char stack[131072] __attribute__((aligned(8)));
+  __nvptx_stacks[0] = stack + sizeof stack;
+#endif
+
   exit (main (argc, argv));
 }
diff --git a/libgcc/config/nvptx/stacks.c b/libgcc/config/nvptx/stacks.c
new file mode 100644
index 0000000..7582c72
--- /dev/null
+++ b/libgcc/config/nvptx/stacks.c
@@ -0,0 +1,24 @@
+/* Define shared memory array for -msoft-stack.
+
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+char *__nvptx_stacks[32] __attribute__((shared,nocommon));
diff --git a/libgcc/config/nvptx/t-nvptx b/libgcc/config/nvptx/t-nvptx
index daf252f..23a8252 100644
--- a/libgcc/config/nvptx/t-nvptx
+++ b/libgcc/config/nvptx/t-nvptx
@@ -1,4 +1,5 @@
-LIB2ADD=$(srcdir)/config/nvptx/reduction.c
+LIB2ADD=$(srcdir)/config/nvptx/reduction.c \
+	$(srcdir)/config/nvptx/stacks.c
 
 LIB2ADDEH=
 LIB2FUNCS_EXCLUDE=__main
-- 
2.6.6

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 0/8] NVPTX offloading to NVPTX: backend patches
@ 2016-06-09 16:54 Alexander Monakov
  2016-06-09 16:54 ` [PATCH 5/8] nvptx mkoffload: pass -mgomp for OpenMP offloading Alexander Monakov
                   ` (8 more replies)
  0 siblings, 9 replies; 16+ messages in thread
From: Alexander Monakov @ 2016-06-09 16:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: Nathan Sidwell, Jakub Jelinek

Hi,

I'm sending updated patch series with backend prerequisites for OpenMP
offloading to the NVIDIA PTX ISA.  The first patch has already received some
comments and this version reflects review feedback.  The other patches have
been adjusted for clarity and re-cut in a more rigorous manner.  All patches are
rebased onto current trunk.

Jakub, can you offer wishes/recommendations for sending the rest of
(middle-end and libgomp) patches?  As you know there's a branch with
development history; is that of interest, or would it be easier if I rebased
all stuff anew on current trunk?

Thanks.
Alexander

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 2/8] nvptx: implement predicated instructions
  2016-06-09 16:54 [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Alexander Monakov
                   ` (5 preceding siblings ...)
  2016-06-09 16:54 ` [PATCH 3/8] nvptx -muniform-simt Alexander Monakov
@ 2016-06-09 16:54 ` Alexander Monakov
  2016-06-09 16:54 ` [PATCH 8/8] nvptx: handle OpenMP "omp target entrypoint" Alexander Monakov
  2016-06-09 17:00 ` [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Jakub Jelinek
  8 siblings, 0 replies; 16+ messages in thread
From: Alexander Monakov @ 2016-06-09 16:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: Nathan Sidwell

This patch wires up generation of predicated instruction forms in nvptx.md and
fixes their handling in nvptx.c.  This is a prerequisite for the following
patch.  On its own it doesn't affect generated code because COND_EXEC
instructions are created by if-conversion only after register allocation,
which is not performed on NVPTX.

	* config/nvptx/nvptx.c (nvptx_output_call_insn): Handle COND_EXEC
	patterns.  Emit instruction predicate.
	(nvptx_print_operand): Fix handling of instruction predicates.
	* config/nvptx/nvptx.md (predicable): New attribute.  Generate
	predicated forms via define_cond_exec.
	(br_true): Mark as not predicable.
	(br_false): Ditto.
	(br_true_uni): Ditto.
	(br_false_uni): Ditto.
	(return): Ditto.
	(trap_if_true): Ditto.
	(trap_if_false): Ditto.
	(nvptx_fork): Ditto.
	(nvptx_forked): Ditto.
	(nvptx_joining): Ditto.
	(nvptx_join): Ditto.
	(nvptx_barsync): Ditto.
---
 gcc/config/nvptx/nvptx.c  | 14 ++++++++------
 gcc/config/nvptx/nvptx.md | 43 +++++++++++++++++++++++++++++++------------
 2 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 1c95fdf..7b90cb1 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -1919,6 +1919,8 @@ nvptx_output_mov_insn (rtx dst, rtx src)
   return "%.\tcvt%t0%t1\t%0, %1;";
 }
 
+static void nvptx_print_operand (FILE *, rtx, int);
+
 /* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
    involves writing .param declarations and in/out copies into them.  For
    indirect calls, also write the .callprototype.  */
@@ -1930,6 +1932,8 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
   static int labelno;
   bool needs_tgt = register_operand (callee, Pmode);
   rtx pat = PATTERN (insn);
+  if (GET_CODE (pat) == COND_EXEC)
+    pat = COND_EXEC_CODE (pat);
   int arg_end = XVECLEN (pat, 0);
   tree decl = NULL_TREE;
 
@@ -1974,6 +1978,8 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
       fprintf (asm_out_file, ";\n");
     }
 
+  /* The '.' stands for the call's predicate, if any.  */
+  nvptx_print_operand (asm_out_file, NULL_RTX, '.');
   fprintf (asm_out_file, "\t\tcall ");
   if (result != NULL_RTX)
     fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
@@ -2037,8 +2043,6 @@ nvptx_print_operand_punct_valid_p (unsigned char c)
   return c == '.' || c== '#';
 }
 
-static void nvptx_print_operand (FILE *, rtx, int);
-
 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */
 
 static void
@@ -2099,12 +2103,10 @@ nvptx_print_operand (FILE *file, rtx x, int code)
       x = current_insn_predicate;
       if (x)
 	{
-	  unsigned int regno = REGNO (XEXP (x, 0));
-	  fputs ("[", file);
+	  fputs ("@", file);
 	  if (GET_CODE (x) == EQ)
 	    fputs ("!", file);
-	  fputs (reg_names [regno], file);
-	  fputs ("]", file);
+	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
 	}
       return;
     }
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index bbc8b1c..bfe3260 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -126,6 +126,17 @@
   return true;
 })
 
+(define_attr "predicable" "false,true"
+  (const_string "true"))
+
+(define_cond_exec
+  [(match_operator 0 "predicate_operator"
+      [(match_operand:BI 1 "nvptx_register_operand" "")
+       (match_operand:BI 2 "const0_operand" "")])]
+  ""
+  ""
+  )
+
 (define_constraint "P0"
   "An integer with the value 0."
   (and (match_code "const_int")
@@ -511,7 +522,8 @@
 		      (label_ref (match_operand 1 "" ""))
 		      (pc)))]
   ""
-  "%j0\\tbra\\t%l1;")
+  "%j0\\tbra\\t%l1;"
+  [(set_attr "predicable" "false")])
 
 (define_insn "br_false"
   [(set (pc)
@@ -520,7 +532,8 @@
 		      (label_ref (match_operand 1 "" ""))
 		      (pc)))]
   ""
-  "%J0\\tbra\\t%l1;")
+  "%J0\\tbra\\t%l1;"
+  [(set_attr "predicable" "false")])
 
 ;; unified conditional branch
 (define_insn "br_true_uni"
@@ -529,7 +542,8 @@
 		       UNSPEC_BR_UNIFIED) (const_int 0))
         (label_ref (match_operand 1 "" "")) (pc)))]
   ""
-  "%j0\\tbra.uni\\t%l1;")
+  "%j0\\tbra.uni\\t%l1;"
+  [(set_attr "predicable" "false")])
 
 (define_insn "br_false_uni"
   [(set (pc) (if_then_else
@@ -537,7 +551,8 @@
 		       UNSPEC_BR_UNIFIED) (const_int 0))
         (label_ref (match_operand 1 "" "")) (pc)))]
   ""
-  "%J0\\tbra.uni\\t%l1;")
+  "%J0\\tbra.uni\\t%l1;"
+  [(set_attr "predicable" "false")])
 
 (define_expand "cbranch<mode>4"
   [(set (pc)
@@ -940,7 +955,8 @@
   ""
 {
   return nvptx_output_return ();
-})
+}
+  [(set_attr "predicable" "false")])
 
 (define_expand "epilogue"
   [(clobber (const_int 0))]
@@ -1029,14 +1045,16 @@
 		(const_int 0))
 	    (const_int 0))]
   ""
-  "%j0 trap;")
+  "%j0 trap;"
+  [(set_attr "predicable" "false")])
 
 (define_insn "trap_if_false"
   [(trap_if (eq (match_operand:BI 0 "nvptx_register_operand" "R")
 		(const_int 0))
 	    (const_int 0))]
   ""
-  "%J0 trap;")
+  "%J0 trap;"
+  [(set_attr "predicable" "false")])
 
 (define_expand "ctrap<mode>4"
   [(trap_if (match_operator 0 "nvptx_comparison_operator"
@@ -1085,28 +1103,28 @@
 		       UNSPECV_FORK)]
   ""
   "// fork %0;"
-)
+  [(set_attr "predicable" "false")])
 
 (define_insn "nvptx_forked"
   [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
 		       UNSPECV_FORKED)]
   ""
   "// forked %0;"
-)
+  [(set_attr "predicable" "false")])
 
 (define_insn "nvptx_joining"
   [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
 		       UNSPECV_JOINING)]
   ""
   "// joining %0;"
-)
+  [(set_attr "predicable" "false")])
 
 (define_insn "nvptx_join"
   [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
 		       UNSPECV_JOIN)]
   ""
   "// join %0;"
-)
+  [(set_attr "predicable" "false")])
 
 (define_expand "oacc_fork"
   [(set (match_operand:SI 0 "nvptx_nonmemory_operand" "")
@@ -1254,4 +1272,5 @@
   [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
 		    UNSPECV_BARSYNC)]
   ""
-  "\\tbar.sync\\t%0;")
+  "\\tbar.sync\\t%0;"
+  [(set_attr "predicable" "false")])
-- 
2.6.6

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 7/8] nvptx: new insns for OpenMP SIMD-on-SIMT
  2016-06-09 16:54 [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Alexander Monakov
  2016-06-09 16:54 ` [PATCH 5/8] nvptx mkoffload: pass -mgomp for OpenMP offloading Alexander Monakov
@ 2016-06-09 16:54 ` Alexander Monakov
  2016-06-09 16:54 ` [PATCH 1/8] nvptx -msoft-stack Alexander Monakov
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 16+ messages in thread
From: Alexander Monakov @ 2016-06-09 16:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: Nathan Sidwell

This patch implements in nvptx.md a few new instruction patterns that are used
for OpenMP SIMD code.

	* config/nvptx/nvptx-protos.h (nvptx_shuffle_kind): Move enum
	declaration from nvptx.c.
	(nvptx_gen_shuffle): Declare.
	* config/nvptx/nvptx.c (nvptx_shuffle_kind): Move to nvptx-protos.h.
	(nvptx_gen_shuffle): Export.
	* config/nvptx/nvptx.md (UNSPEC_VOTE_BALLOT): New unspec.
	(UNSPEC_LANEID): Ditto.
	(UNSPECV_NOUNROLL): Ditto.
	(nvptx_vote_ballot): New pattern.
	(omp_simt_lane): Ditto.
	(omp_simt_last_lane): Ditto.
	(omp_simt_ordered): Ditto.
	(omp_simt_vote_any): Ditto.
	(omp_simt_xchg_bfly): Ditto.
	(omp_simt_xchg_idx): Ditto.
	(nvptx_nounroll): Ditto.
	* target-insns.def (omp_simt_lane): New.
	(omp_simt_last_lane): New.
	(omp_simt_ordered): New.
	(omp_simt_vote_any): New.
	(omp_simt_xchg_bfly): New.
	(omp_simt_xchg_idx): New.
---
 gcc/config/nvptx/nvptx-protos.h | 11 +++++
 gcc/config/nvptx/nvptx.c        | 12 +-----
 gcc/config/nvptx/nvptx.md       | 94 +++++++++++++++++++++++++++++++++++++++++
 gcc/target-insns.def            |  6 +++
 4 files changed, 112 insertions(+), 11 deletions(-)

diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h
index 647607d..331ec0a 100644
--- a/gcc/config/nvptx/nvptx-protos.h
+++ b/gcc/config/nvptx/nvptx-protos.h
@@ -21,6 +21,16 @@
 #ifndef GCC_NVPTX_PROTOS_H
 #define GCC_NVPTX_PROTOS_H
 
+/* The kind of shuffe instruction.  */
+enum nvptx_shuffle_kind
+{
+  SHUFFLE_UP,
+  SHUFFLE_DOWN,
+  SHUFFLE_BFLY,
+  SHUFFLE_IDX,
+  SHUFFLE_MAX
+};
+
 extern void nvptx_declare_function_name (FILE *, const char *, const_tree decl);
 extern void nvptx_declare_object_name (FILE *file, const char *name,
 				       const_tree decl);
@@ -36,6 +46,7 @@ extern void nvptx_register_pragmas (void);
 extern void nvptx_expand_oacc_fork (unsigned);
 extern void nvptx_expand_oacc_join (unsigned);
 extern void nvptx_expand_call (rtx, rtx);
+extern rtx nvptx_gen_shuffle (rtx, rtx, rtx, nvptx_shuffle_kind);
 extern rtx nvptx_expand_compare (rtx);
 extern const char *nvptx_ptx_type_from_mode (machine_mode, bool);
 extern const char *nvptx_output_mov_insn (rtx, rtx);
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 46afae4..d959f85 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -70,16 +70,6 @@
 /* This file should be included last.  */
 #include "target-def.h"
 
-/* The kind of shuffe instruction.  */
-enum nvptx_shuffle_kind
-{
-  SHUFFLE_UP,
-  SHUFFLE_DOWN,
-  SHUFFLE_BFLY,
-  SHUFFLE_IDX,
-  SHUFFLE_MAX
-};
-
 /* The various PTX memory areas an object might reside in.  */
 enum nvptx_data_area
 {
@@ -1433,7 +1423,7 @@ nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
 /* Generate an instruction or sequence to broadcast register REG
    across the vectors of a single warp.  */
 
-static rtx
+rtx
 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
 {
   rtx res;
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 31f87ec..b1b27a2 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -42,6 +42,10 @@
 
    UNSPEC_BIT_CONV
 
+   UNSPEC_VOTE_BALLOT
+
+   UNSPEC_LANEID
+
    UNSPEC_SHUFFLE
    UNSPEC_BR_UNIFIED
 ])
@@ -57,6 +61,8 @@
    UNSPECV_FORKED
    UNSPECV_JOINING
    UNSPECV_JOIN
+
+   UNSPECV_NOUNROLL
 ])
 
 (define_attr "subregs_ok" "false,true"
@@ -1166,6 +1172,88 @@
   ""
   "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;")
 
+(define_insn "nvptx_vote_ballot"
+  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
+	(unspec:SI [(match_operand:BI 1 "nvptx_register_operand" "R")]
+		   UNSPEC_VOTE_BALLOT))]
+  ""
+  "%.\\tvote.ballot.b32\\t%0, %1;")
+
+;; Patterns for OpenMP SIMD-via-SIMT lowering
+
+;; Implement IFN_GOMP_SIMT_LANE: set operand 0 to lane index
+(define_insn "omp_simt_lane"
+  [(set (match_operand:SI 0 "nvptx_register_operand" "")
+	(unspec:SI [(const_int 0)] UNSPEC_LANEID))]
+  ""
+  "%.\\tmov.u32\\t%0, %%laneid;")
+
+;; Implement IFN_GOMP_SIMT_ORDERED: copy operand 1 to operand 0 and
+;; place a compiler barrier to disallow unrolling/peeling the containing loop
+(define_expand "omp_simt_ordered"
+  [(match_operand:SI 0 "nvptx_register_operand" "=R")
+   (match_operand:SI 1 "nvptx_register_operand" "R")]
+  ""
+{
+  emit_move_insn (operands[0], operands[1]);
+  emit_insn (gen_nvptx_nounroll ());
+  DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_XCHG_BFLY: perform a "butterfly" exchange
+;; across lanes
+(define_expand "omp_simt_xchg_bfly"
+  [(match_operand 0 "nvptx_register_operand" "=R")
+   (match_operand 1 "nvptx_register_operand" "R")
+   (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")]
+  ""
+{
+  emit_insn (nvptx_gen_shuffle (operands[0], operands[1], operands[2],
+				SHUFFLE_BFLY));
+  DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_XCHG_IDX: broadcast value in operand 1
+;; from lane given by index in operand 2 to operand 0 in all lanes
+(define_expand "omp_simt_xchg_idx"
+  [(match_operand 0 "nvptx_register_operand" "=R")
+   (match_operand 1 "nvptx_register_operand" "R")
+   (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")]
+  ""
+{
+  emit_insn (nvptx_gen_shuffle (operands[0], operands[1], operands[2],
+				SHUFFLE_IDX));
+  DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_VOTE_ANY:
+;; set operand 0 to zero iff all lanes supply zero in operand 1
+(define_expand "omp_simt_vote_any"
+  [(match_operand:SI 0 "nvptx_register_operand" "=R")
+   (match_operand:SI 1 "nvptx_register_operand" "R")]
+  ""
+{
+  rtx pred = gen_reg_rtx (BImode);
+  emit_move_insn (pred, gen_rtx_NE (BImode, operands[1], const0_rtx));
+  emit_insn (gen_nvptx_vote_ballot (operands[0], pred));
+  DONE;
+})
+
+;; Implement IFN_GOMP_SIMT_LAST_LANE:
+;; set operand 0 to the lowest lane index that passed non-zero in operand 1
+(define_expand "omp_simt_last_lane"
+  [(match_operand:SI 0 "nvptx_register_operand" "=R")
+   (match_operand:SI 1 "nvptx_register_operand" "R")]
+  ""
+{
+  rtx pred = gen_reg_rtx (BImode);
+  rtx tmp = gen_reg_rtx (SImode);
+  emit_move_insn (pred, gen_rtx_NE (BImode, operands[1], const0_rtx));
+  emit_insn (gen_nvptx_vote_ballot (tmp, pred));
+  emit_insn (gen_ctzsi2 (operands[0], tmp));
+  DONE;
+})
+
 ;; extract parts of a 64 bit object into 2 32-bit ints
 (define_insn "unpack<mode>si2"
   [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
@@ -1282,3 +1370,9 @@
   ""
   "\\tbar.sync\\t%0;"
   [(set_attr "predicable" "false")])
+
+(define_insn "nvptx_nounroll"
+  [(unspec_volatile [(const_int 0)] UNSPECV_NOUNROLL)]
+  ""
+  "\\t.pragma \\\"nounroll\\\";"
+  [(set_attr "predicable" "false")])
diff --git a/gcc/target-insns.def b/gcc/target-insns.def
index a6a040e..e011a5a 100644
--- a/gcc/target-insns.def
+++ b/gcc/target-insns.def
@@ -68,6 +68,12 @@ DEF_TARGET_INSN (oacc_dim_pos, (rtx x0, rtx x1))
 DEF_TARGET_INSN (oacc_dim_size, (rtx x0, rtx x1))
 DEF_TARGET_INSN (oacc_fork, (rtx x0, rtx x1, rtx x2))
 DEF_TARGET_INSN (oacc_join, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (omp_simt_lane, (rtx x0))
+DEF_TARGET_INSN (omp_simt_last_lane, (rtx x0, rtx x1))
+DEF_TARGET_INSN (omp_simt_ordered, (rtx x0, rtx x1))
+DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1))
+DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2))
+DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2))
 DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2))
 DEF_TARGET_INSN (probe_stack, (rtx x0))
 DEF_TARGET_INSN (probe_stack_address, (rtx x0))
-- 
2.6.6

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 4/8] nvptx -mgomp
  2016-06-09 16:54 [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Alexander Monakov
                   ` (3 preceding siblings ...)
  2016-06-09 16:54 ` [PATCH 6/8] new target hook: TARGET_SIMT_VF Alexander Monakov
@ 2016-06-09 16:54 ` Alexander Monakov
  2016-06-13  3:57   ` Sandra Loosemore
  2016-06-09 16:54 ` [PATCH 3/8] nvptx -muniform-simt Alexander Monakov
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 16+ messages in thread
From: Alexander Monakov @ 2016-06-09 16:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: Nathan Sidwell, Sandra Loosemore

This patch adds option -mgomp which enables -msoft-stack plus -muniform-simt,
and builds a multilib with it.  This codegen convention is used for OpenMP
offloading.

	* config/nvptx/nvptx.c (nvptx_option_override): Handle TARGET_GOMP.
	* config/nvptx/nvptx.opt (mgomp): New option.
	* config/nvptx/t-nvptx (MULTILIB_OPTIONS): New.
	* doc/invoke.texi (mgomp): Document.
---
 gcc/config/nvptx/nvptx.c   | 3 +++
 gcc/config/nvptx/nvptx.opt | 4 ++++
 gcc/config/nvptx/t-nvptx   | 2 ++
 gcc/doc/invoke.texi        | 5 +++++
 4 files changed, 14 insertions(+)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index f91573a..294002e 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -192,6 +192,9 @@ nvptx_option_override (void)
   worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
   SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
   worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+
+  if (TARGET_GOMP)
+    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
 }
 
 /* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index 0d46e1d..cb6194d 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -40,3 +40,7 @@ Use custom stacks instead of local memory for automatic storage.
 muniform-simt
 Target Report Mask(UNIFORM_SIMT)
 Generate code that can keep local state uniform across all lanes.
+
+mgomp
+Target Report Mask(GOMP)
+Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt.
diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx
index e2580c9..6c1010d 100644
--- a/gcc/config/nvptx/t-nvptx
+++ b/gcc/config/nvptx/t-nvptx
@@ -8,3 +8,5 @@ ALL_HOST_OBJS += mkoffload.o
 mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBDEPS)
 	+$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \
 	  mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS)
+
+MULTILIB_OPTIONS = mgomp
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 534025d..bae7f2d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -19631,6 +19631,11 @@ generation variant is used for OpenMP offloading, but the option is exposed on
 its own for the purpose of testing the compiler; to generate code suitable for
 linking into programs using OpenMP offloading, use option @option{-mgomp}.
 
+@item -mgomp
+@opindex mgomp
+Generate code for use in OpenMP offloading: enables @option{-msoft-stack} and
+@option{-muniform-simt} options, and selects corresponding multilib variant.
+
 @end table
 
 @node PDP-11 Options
-- 
2.6.6

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 8/8] nvptx: handle OpenMP "omp target entrypoint"
  2016-06-09 16:54 [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Alexander Monakov
                   ` (6 preceding siblings ...)
  2016-06-09 16:54 ` [PATCH 2/8] nvptx: implement predicated instructions Alexander Monakov
@ 2016-06-09 16:54 ` Alexander Monakov
  2016-06-09 17:00 ` [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Jakub Jelinek
  8 siblings, 0 replies; 16+ messages in thread
From: Alexander Monakov @ 2016-06-09 16:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: Nathan Sidwell

This patch implements emission of OpenMP target region entrypoints: the
compiler emits the target function with '$impl' appended to the name, and
under the original name it emits a short entry sequence that sets up shared
memory arrays and calls the target function via 'gomp_nvptx_main' (which is
implemented in libgomp).

	* config/nvptx/nvptx.c (write_as_kernel): Restrict to OpenACC target
	regions.
	(write_omp_entry): New.  Use it...
	(nvptx_declare_function_name): ...here to emit OpenMP target region
	entrypoints.
	(nvptx_record_offload_symbol): Handle NULL attributes.
---
 gcc/config/nvptx/nvptx.c | 82 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 78 insertions(+), 4 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index d959f85..5e47002 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -723,7 +723,10 @@ static bool
 write_as_kernel (tree attrs)
 {
   return (lookup_attribute ("kernel", attrs) != NULL_TREE
-	  || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
+	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
+	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
+  /* For OpenMP target regions, the corresponding kernel entry is emitted from
+     write_omp_entry as a separate function.  */
 }
 
 /* Emit a linker marker for a function decl or defn.  */
@@ -1073,6 +1076,69 @@ nvptx_init_unisimt_predicate (FILE *file)
   need_unisimt_decl = true;
 }
 
+/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
+
+   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
+   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
+   {
+     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
+     __nvptx_uni[tid.y] = 0;
+     gomp_nvptx_main (ORIG, arg);
+   }
+   ORIG itself should not be emitted as a PTX .entry function.  */
+
+static void
+write_omp_entry (FILE *file, const char *name, const char *orig)
+{
+  static bool gomp_nvptx_main_declared;
+  if (!gomp_nvptx_main_declared)
+    {
+      gomp_nvptx_main_declared = true;
+      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
+      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
+        << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
+    }
+#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
+ (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
+{\n\
+	.reg.u32 %r<3>;\n\
+	.reg.u" PS " %R<4>;\n\
+	mov.u32 %r0, %tid.y;\n\
+	mov.u32 %r1, %ntid.y;\n\
+	mov.u32 %r2, %ctaid.x;\n\
+	cvt.u" PS ".u32 %R1, %r0;\n\
+	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
+	mov.u" PS " %R0, __nvptx_stacks;\n\
+	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
+	ld.param.u" PS " %R2, [%stack];\n\
+	ld.param.u" PS " %R3, [%sz];\n\
+	add.u" PS " %R2, %R2, %R3;\n\
+	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
+	st.shared.u" PS " [%R0], %R2;\n\
+	mov.u" PS " %R0, __nvptx_uni;\n\
+	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
+	mov.u32 %r0, 0;\n\
+	st.shared.u32 [%R0], %r0;\n\
+	mov.u" PS " %R0, \0;\n\
+	ld.param.u" PS " %R1, [%arg];\n\
+	{\n\
+		.param.u" PS " %P<2>;\n\
+		st.param.u" PS " [%P0], %R0;\n\
+		st.param.u" PS " [%P1], %R1;\n\
+		call.uni gomp_nvptx_main, (%P0, %P1);\n\
+	}\n\
+	ret.uni;\n\
+}\n"
+  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
+  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32  ");
+#undef ENTRY_TEMPLATE
+  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
+  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
+  const char *entry_2 = entry_1 + strlen (entry64) + 1;
+  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
+  need_softstack_decl = need_unisimt_decl = true;
+}
+
 /* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
    function, including local var decls and copies from the arguments to
    local regs.  */
@@ -1084,6 +1150,14 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
   tree result_type = TREE_TYPE (fntype);
   int argno = 0;
 
+  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
+      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
+    {
+      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
+      sprintf (buf, "%s$impl", name);
+      write_omp_entry (file, name, buf);
+      name = buf;
+    }
   /* We construct the initial part of the function into a string
      stream, in order to share the prototype writing code.  */
   std::stringstream s;
@@ -4153,13 +4227,13 @@ nvptx_record_offload_symbol (tree decl)
     case FUNCTION_DECL:
       {
 	tree attr = get_oacc_fn_attrib (decl);
-	tree dims = TREE_VALUE (attr);
-	unsigned ix;
+	/* OpenMP offloading does not set this attribute.  */
+	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
 
 	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
 		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
 
-	for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+	for (; dims; dims = TREE_CHAIN (dims))
 	  {
 	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
 
-- 
2.6.6

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 5/8] nvptx mkoffload: pass -mgomp for OpenMP offloading
  2016-06-09 16:54 [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Alexander Monakov
@ 2016-06-09 16:54 ` Alexander Monakov
  2016-06-09 16:54 ` [PATCH 7/8] nvptx: new insns for OpenMP SIMD-on-SIMT Alexander Monakov
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 16+ messages in thread
From: Alexander Monakov @ 2016-06-09 16:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: Nathan Sidwell

This patch wires up use of alternative -mgomp multilib for OpenMP offloading
via nvptx mkoffload.  It makes OpenACC and OpenMP incompatible for
simultaneous offloading compilation, so I've added a diagnostic for that.

	* config/nvptx/mkoffload.c (main): Check that either OpenACC or OpenMP
	is selected.  Pass -mgomp to offload compiler in OpenMP case.
---
 gcc/config/nvptx/mkoffload.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/gcc/config/nvptx/mkoffload.c b/gcc/config/nvptx/mkoffload.c
index c8eed45..d876c7b 100644
--- a/gcc/config/nvptx/mkoffload.c
+++ b/gcc/config/nvptx/mkoffload.c
@@ -460,6 +460,7 @@ main (int argc, char **argv)
 
   /* Scan the argument vector.  */
   bool fopenmp = false;
+  bool fopenacc = false;
   for (int i = 1; i < argc; i++)
     {
 #define STR "-foffload-abi="
@@ -476,11 +477,15 @@ main (int argc, char **argv)
 #undef STR
       else if (strcmp (argv[i], "-fopenmp") == 0)
 	fopenmp = true;
+      else if (strcmp (argv[i], "-fopenacc") == 0)
+	fopenacc = true;
       else if (strcmp (argv[i], "-save-temps") == 0)
 	save_temps = true;
       else if (strcmp (argv[i], "-v") == 0)
 	verbose = true;
     }
+  if (!(fopenacc ^ fopenmp))
+    fatal_error (input_location, "either -fopenacc or -fopenmp must be set");
 
   struct obstack argv_obstack;
   obstack_init (&argv_obstack);
@@ -501,6 +506,8 @@ main (int argc, char **argv)
     default:
       gcc_unreachable ();
     }
+  if (fopenmp)
+    obstack_ptr_grow (&argv_obstack, "-mgomp");
 
   for (int ix = 1; ix != argc; ix++)
     {
-- 
2.6.6

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 3/8] nvptx -muniform-simt
  2016-06-09 16:54 [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Alexander Monakov
                   ` (4 preceding siblings ...)
  2016-06-09 16:54 ` [PATCH 4/8] nvptx -mgomp Alexander Monakov
@ 2016-06-09 16:54 ` Alexander Monakov
  2016-06-13  3:55   ` Sandra Loosemore
  2016-06-09 16:54 ` [PATCH 2/8] nvptx: implement predicated instructions Alexander Monakov
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 16+ messages in thread
From: Alexander Monakov @ 2016-06-09 16:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: Nathan Sidwell, Sandra Loosemore

This patch implements -muniform-simt code generation option, which is used to
emit code for OpenMP offloading.  The goal is to emit code that can either
execute "normally", or can execute in a way that keeps all lanes in a given
warp active, their local state synchronized, and observable effects from
execution happening as if only one lane was active.  The latter mode is how
OpenMP offloaded code runs outside of SIMD regions.

To achieve that, the compiler instruments atomic instructions and calls to
functions provided by the CUDA runtime (malloc, free, vprintf), i.e. those
that GCC itself doesn't compile.

Instrumentation converts an atomic instruction to a predicated atomic
instruction followed by a warp shuffle.  To illustrate,

	atom.op dest, <args>

becomes

  @PRED	atom.op	dest, <args>
	shfl.idx dest, dest, MASTER

where, outside of SIMD regions:

- PRED is true in lane 0, false in lanes 1-31, so the side effect happens once
- MASTER is 0 in all lanes, so the shuffle synchronizes 'dest' among all lanes

and inside of SIMD regions:

- PRED is true in all lanes, so the atomic is done in all lanes independently
- MASTER equals to current lane number, so the shuffle is a no-op.

To keep track of current state and compute PRED and MASTER, the compiler uses
shared memory array 'unsigned __nvptx_uni[]' with per-warp all-zeros or
all-ones masks.  The mask word is zero outside of SIMD regions, all-ones
inside.  Function prologue uses mask to compute MASTER and PRED via:

    MASTER = LANE_ID & MASK;
    PRED   = LANE_ID == MASTER;

Calls are handled like atomics.

gcc/
	* config/nvptx/nvptx.c (need_unisimt_decl): New variable.  Set it...
	(nvptx_init_unisimt_predicate): ...here (new function) and use it...
	(nvptx_file_end): ...here to emit declaration of __nvptx_uni array.
	(nvptx_declare_function_name): Call nvptx_init_unisimt_predicate.
	(nvptx_get_unisimt_master): New helper function.
	(nvptx_get_unisimt_predicate): Ditto.
	(nvptx_call_insn_is_syscall_p): Ditto.
	(nvptx_unisimt_handle_set): Ditto.
	(nvptx_reorg_uniform_simt): New.  Transform code for -muniform-simt.
	(nvptx_reorg): Call nvptx_reorg_uniform_simt.
	* config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Define
	__nvptx_unisimt__ when -muniform-simt option is active.
	(struct machine_function): Add unisimt_master, unisimt_predicate
	rtx fields.
	* config/nvptx/nvptx.md (atomic): New attribute.
	(atomic_compare_and_swap<mode>_1): Mark with atomic attribute.
	(atomic_exchange<mode>): Ditto.
	(atomic_fetch_add<mode>): Ditto.
	(atomic_fetch_addsf): Ditto.
	(atomic_fetch_<logic><mode>): Ditto.
	* config/nvptx/nvptx.opt (muniform-simt): New option.
	* doc/invoke.texi (-muniform-simt): Document.

gcc/testsuite/
	* gcc.target/nvptx/unisimt.c: New test.

libgcc/
	* config/nvptx/crt0.c (__main) [__nvptx_unisimt__]: Setup __nvptx_uni.
	* config/nvptx/stacks.c (__nvptx_uni): Define.
---
 gcc/config/nvptx/nvptx.c                 | 124 +++++++++++++++++++++++++++++++
 gcc/config/nvptx/nvptx.h                 |   4 +
 gcc/config/nvptx/nvptx.md                |  18 +++--
 gcc/config/nvptx/nvptx.opt               |   4 +
 gcc/doc/invoke.texi                      |  10 +++
 gcc/testsuite/gcc.target/nvptx/unisimt.c |  22 ++++++
 libgcc/config/nvptx/crt0.c               |   4 +
 libgcc/config/nvptx/stacks.c             |   3 +-
 8 files changed, 183 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/nvptx/unisimt.c

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 7b90cb1..f91573a 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -142,6 +142,9 @@ static GTY(()) tree global_lock_var;
 /* True if any function references __nvptx_stacks.  */
 static bool need_softstack_decl;
 
+/* True if any function references __nvptx_uni.  */
+static bool need_unisimt_decl;
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -1049,6 +1052,34 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
   fprintf (file, "\t}\n");
 }
 
+/* Emit code to initialize predicate and master lane index registers for
+   -muniform-simt code generation variant.  */
+
+static void
+nvptx_init_unisimt_predicate (FILE *file)
+{
+  int bits = POINTER_SIZE;
+  int master = REGNO (cfun->machine->unisimt_master);
+  int pred = REGNO (cfun->machine->unisimt_predicate);
+  fprintf (file, "\t{\n");
+  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
+  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
+  fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits);
+  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
+  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
+	   bits == 64 ? ".wide" : ".lo");
+  fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits);
+  fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits);
+  fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master);
+  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n");
+  /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'.  */
+  fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
+  /* Compute predicate as 'tid.x == master'.  */
+  fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
+  fprintf (file, "\t}\n");
+  need_unisimt_decl = true;
+}
+
 /* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
    function, including local var decls and copies from the arguments to
    local regs.  */
@@ -1138,6 +1169,8 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
   if (cfun->machine->axis_predicate[1])
     nvptx_init_axis_predicate (file,
 			       REGNO (cfun->machine->axis_predicate[1]), "x");
+  if (cfun->machine->unisimt_predicate)
+    nvptx_init_unisimt_predicate (file);
 }
 
 /* Output instruction that sets soft stack pointer in shared memory to the
@@ -2401,6 +2434,89 @@ nvptx_reorg_subreg (void)
     }
 }
 
+/* Return a SImode "master lane index" register for uniform-simt, allocating on
+   first use.  */
+
+static rtx
+nvptx_get_unisimt_master ()
+{
+  rtx &master = cfun->machine->unisimt_master;
+  return master ? master : master = gen_reg_rtx (SImode);
+}
+
+/* Return a BImode "predicate" register for uniform-simt, similar to above.  */
+
+static rtx
+nvptx_get_unisimt_predicate ()
+{
+  rtx &pred = cfun->machine->unisimt_predicate;
+  return pred ? pred : pred = gen_reg_rtx (BImode);
+}
+
+/* Return true if given call insn references one of the functions provided by
+   the CUDA runtime: malloc, free, vprintf.  */
+
+static bool
+nvptx_call_insn_is_syscall_p (rtx_insn *insn)
+{
+  rtx pat = PATTERN (insn);
+  gcc_checking_assert (GET_CODE (pat) == PARALLEL);
+  pat = XVECEXP (pat, 0, 0);
+  if (GET_CODE (pat) == SET)
+    pat = SET_SRC (pat);
+  gcc_checking_assert (GET_CODE (pat) == CALL
+		       && GET_CODE (XEXP (pat, 0)) == MEM);
+  rtx addr = XEXP (XEXP (pat, 0), 0);
+  if (GET_CODE (addr) != SYMBOL_REF)
+    return false;
+  const char *name = XSTR (addr, 0);
+  /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
+     references with forced assembler name refer to PTX syscalls.  For vprintf,
+     accept both normal and forced-assembler-name references.  */
+  return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
+	  || !strcmp (name, "*malloc")
+	  || !strcmp (name, "*free"));
+}
+
+/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
+   propagate its value from lane MASTER to current lane.  */
+
+static void
+nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
+{
+  rtx reg;
+  if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
+    emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
+}
+
+/* Adjust code for uniform-simt code generation variant by making atomics and
+   "syscalls" conditionally executed, and inserting shuffle-based propagation
+   for registers being set.  */
+
+static void
+nvptx_reorg_uniform_simt ()
+{
+  rtx_insn *insn, *next;
+
+  for (insn = get_insns (); insn; insn = next)
+    {
+      next = NEXT_INSN (insn);
+      if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
+	  && !(NONJUMP_INSN_P (insn)
+	       && GET_CODE (PATTERN (insn)) == PARALLEL
+	       && get_attr_atomic (insn)))
+	continue;
+      rtx pat = PATTERN (insn);
+      rtx master = nvptx_get_unisimt_master ();
+      for (int i = 0; i < XVECLEN (pat, 0); i++)
+	nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
+      rtx pred = nvptx_get_unisimt_predicate ();
+      pred = gen_rtx_NE (BImode, pred, const0_rtx);
+      pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
+      validate_change (insn, &PATTERN (insn), pat, false);
+    }
+}
+
 /* Loop structure of the function.  The entire function is described as
    a NULL loop.  */
 
@@ -3922,6 +4038,9 @@ nvptx_reorg (void)
   /* Replace subregs.  */
   nvptx_reorg_subreg ();
 
+  if (TARGET_UNIFORM_SIMT)
+    nvptx_reorg_uniform_simt ();
+
   regstat_free_n_sets_and_refs ();
 
   df_finish_pass (true);
@@ -4118,6 +4237,11 @@ nvptx_file_end (void)
       fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
 	       POINTER_SIZE);
     }
+  if (need_unisimt_decl)
+    {
+      write_var_marker (asm_out_file, false, true, "__nvptx_uni");
+      fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
+    }
 }
 
 /* Expander for the shuffle builtins.  */
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 2c4962a..1702178 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -33,6 +33,8 @@
       builtin_define ("__nvptx__");		\
       if (TARGET_SOFT_STACK)			\
         builtin_define ("__nvptx_softstack__");	\
+      if (TARGET_UNIFORM_SIMT)			\
+        builtin_define ("__nvptx_unisimt__");	\
     } while (0)
 
 /* Avoid the default in ../../gcc.c, which adds "-pthread", which is not
@@ -215,6 +217,8 @@ struct GTY(()) machine_function
   int return_mode; /* Return mode of current fn.
 		      (machine_mode not defined yet.) */
   rtx axis_predicate[2]; /* Neutering predicates.  */
+  rtx unisimt_master; /* 'Master lane index' for -muniform-simt.  */
+  rtx unisimt_predicate; /* Predicate for -muniform-simt.  */
 };
 #endif
 \f
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index bfe3260..31f87ec 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -62,6 +62,9 @@
 (define_attr "subregs_ok" "false,true"
   (const_string "false"))
 
+(define_attr "atomic" "false,true"
+  (const_string "false"))
+
 ;; The nvptx operand predicates, in general, don't permit subregs and
 ;; only literal constants, which differ from the generic ones, which
 ;; permit subregs and symbolc constants (as appropriate)
@@ -1215,7 +1218,8 @@
    (set (match_dup 1)
 	(unspec_volatile:SDIM [(const_int 0)] UNSPECV_CAS))]
   ""
-  "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;")
+  "%.\\tatom%A1.cas.b%T0\\t%0, %1, %2, %3;"
+  [(set_attr "atomic" "true")])
 
 (define_insn "atomic_exchange<mode>"
   [(set (match_operand:SDIM 0 "nvptx_register_operand" "=R")	;; output
@@ -1226,7 +1230,8 @@
    (set (match_dup 1)
 	(match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))]	;; input
   ""
-  "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;")
+  "%.\\tatom%A1.exch.b%T0\\t%0, %1, %2;"
+  [(set_attr "atomic" "true")])
 
 (define_insn "atomic_fetch_add<mode>"
   [(set (match_operand:SDIM 1 "memory_operand" "+m")
@@ -1238,7 +1243,8 @@
    (set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
 	(match_dup 1))]
   ""
-  "%.\\tatom%A1.add%t0\\t%0, %1, %2;")
+  "%.\\tatom%A1.add%t0\\t%0, %1, %2;"
+  [(set_attr "atomic" "true")])
 
 (define_insn "atomic_fetch_addsf"
   [(set (match_operand:SF 1 "memory_operand" "+m")
@@ -1250,7 +1256,8 @@
    (set (match_operand:SF 0 "nvptx_register_operand" "=R")
 	(match_dup 1))]
   ""
-  "%.\\tatom%A1.add%t0\\t%0, %1, %2;")
+  "%.\\tatom%A1.add%t0\\t%0, %1, %2;"
+  [(set_attr "atomic" "true")])
 
 (define_code_iterator any_logic [and ior xor])
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])
@@ -1266,7 +1273,8 @@
    (set (match_operand:SDIM 0 "nvptx_register_operand" "=R")
 	(match_dup 1))]
   "0"
-  "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;")
+  "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;"
+  [(set_attr "atomic" "true")])
 
 (define_insn "nvptx_barsync"
   [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index 71ddadb..0d46e1d 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -36,3 +36,7 @@ Optimize partition neutering.
 msoft-stack
 Target Report Mask(SOFT_STACK)
 Use custom stacks instead of local memory for automatic storage.
+
+muniform-simt
+Target Report Mask(UNIFORM_SIMT)
+Generate code that can keep local state uniform across all lanes.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5ed5789..534025d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -19621,6 +19621,16 @@ for OpenMP offloading, but the option is exposed on its own for the purpose
 of testing the compiler; to generate code suitable for linking into programs
 using OpenMP offloading, use option @option{-mgomp}.
 
+@item -muniform-simt
+@opindex muniform-simt
+Generate code that allows to keep all lanes in each warp active, even when
+observable effects from execution should appear as if only one lane was
+active. This is achieved by instrumenting syscalls and atomic instructions in
+a lightweight way that allows to switch behavior at runtime. This code
+generation variant is used for OpenMP offloading, but the option is exposed on
+its own for the purpose of testing the compiler; to generate code suitable for
+linking into programs using OpenMP offloading, use option @option{-mgomp}.
+
 @end table
 
 @node PDP-11 Options
diff --git a/gcc/testsuite/gcc.target/nvptx/unisimt.c b/gcc/testsuite/gcc.target/nvptx/unisimt.c
new file mode 100644
index 0000000..d268721
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/unisimt.c
@@ -0,0 +1,22 @@
+/* { dg-options "-O2 -muniform-simt" } */
+/* { dg-do run } */
+
+#include <stdarg.h>
+
+static __attribute__((noinline,noclone)) int print (const char *fmt, ...)
+{
+  va_list va;
+  va_start (va, fmt);
+  int r = __builtin_vprintf (fmt, va);
+  va_end (va);
+  return r;
+}
+
+int main()
+{
+  static int v;
+  __sync_fetch_and_add (&v, 1);
+  if (print ("%d", v) != 1) /* { dg-output "^1$" } */
+    __builtin_abort ();
+  return 0;
+}
diff --git a/libgcc/config/nvptx/crt0.c b/libgcc/config/nvptx/crt0.c
index e31696c..2431308 100644
--- a/libgcc/config/nvptx/crt0.c
+++ b/libgcc/config/nvptx/crt0.c
@@ -38,6 +38,10 @@ __main (int *rval_ptr, int argc, void **argv)
   static char stack[131072] __attribute__((aligned(8)));
   __nvptx_stacks[0] = stack + sizeof stack;
 #endif
+#ifdef __nvptx_unisimt__
+  extern unsigned __nvptx_uni[32] __attribute__((shared));
+  __nvptx_uni[0] = 0;
+#endif
 
   exit (main (argc, argv));
 }
diff --git a/libgcc/config/nvptx/stacks.c b/libgcc/config/nvptx/stacks.c
index 7582c72..3c33e00 100644
--- a/libgcc/config/nvptx/stacks.c
+++ b/libgcc/config/nvptx/stacks.c
@@ -1,4 +1,4 @@
-/* Define shared memory array for -msoft-stack.
+/* Define shared memory arrays for -msoft-stack and -muniform-simt.
 
    Copyright (C) 2015-2016 Free Software Foundation, Inc.
 
@@ -22,3 +22,4 @@
    <http://www.gnu.org/licenses/>.  */
 
 char *__nvptx_stacks[32] __attribute__((shared,nocommon));
+unsigned __nvptx_uni[32] __attribute__((shared,nocommon));
-- 
2.6.6

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 6/8] new target hook: TARGET_SIMT_VF
  2016-06-09 16:54 [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Alexander Monakov
                   ` (2 preceding siblings ...)
  2016-06-09 16:54 ` [PATCH 1/8] nvptx -msoft-stack Alexander Monakov
@ 2016-06-09 16:54 ` Alexander Monakov
  2016-06-09 16:54 ` [PATCH 4/8] nvptx -mgomp Alexander Monakov
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 16+ messages in thread
From: Alexander Monakov @ 2016-06-09 16:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: Nathan Sidwell

This patch adds a new target hook and implements it in a straightforward
manner on NVPTX to indicate that the target is running in SIMT fashion with 32
threads in a synchronous group ("warp").  For use in OpenMP transforms.

	* config/nvptx/nvptx.c (nvptx_simt_vf): New.
	(TARGET_SIMT_VF): Define.
	* doc/tm.texi: Regenerate.
	* doc/tm.texi.in: (TARGET_SIMT_VF): New hook.
	* target.def: Define it.
---
 gcc/config/nvptx/nvptx.c | 11 +++++++++++
 gcc/doc/tm.texi          |  4 ++++
 gcc/doc/tm.texi.in       |  2 ++
 gcc/target.def           | 12 ++++++++++++
 4 files changed, 29 insertions(+)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 294002e..46afae4 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4430,6 +4430,14 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
 #define PTX_WORKER_LENGTH 32
 #define PTX_GANG_DEFAULT  32
 
+/* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
+
+static int
+nvptx_simt_vf ()
+{
+  return PTX_VECTOR_LENGTH;
+}
+
 /* Validate compute dimensions of an OpenACC offload or routine, fill
    in non-unity defaults.  FN_LEVEL indicates the level at which a
    routine might spawn a loop.  It is negative for non-routines.  If
@@ -5195,6 +5203,9 @@ nvptx_goacc_reduction (gcall *call)
 #undef  TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
 
+#undef TARGET_SIMT_VF
+#define TARGET_SIMT_VF nvptx_simt_vf
+
 #undef TARGET_GOACC_VALIDATE_DIMS
 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index b318615..93af006 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5768,6 +5768,10 @@ usable.  In that case, the smaller the number is, the more desirable it is
 to use it.
 @end deftypefn
 
+@deftypefn {Target Hook} int TARGET_SIMT_VF (void)
+Return number of threads in SIMT thread group on the target.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_GOACC_VALIDATE_DIMS (tree @var{decl}, int *@var{dims}, int @var{fn_level})
 This hook should check the launch dimensions provided for an OpenACC
 compute region, or routine.  Defaulted values are represented as -1
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 1e8423c..ccec1b1 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4263,6 +4263,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_SIMD_CLONE_USABLE
 
+@hook TARGET_SIMT_VF
+
 @hook TARGET_GOACC_VALIDATE_DIMS
 
 @hook TARGET_GOACC_DIM_LIMIT
diff --git a/gcc/target.def b/gcc/target.def
index a4df363..ba47e5b 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1639,6 +1639,18 @@ int, (struct cgraph_node *), NULL)
 
 HOOK_VECTOR_END (simd_clone)
 
+/* Functions relating to OpenMP SIMT vectorization transform.  */
+#undef HOOK_PREFIX
+#define HOOK_PREFIX "TARGET_SIMT_"
+HOOK_VECTOR (TARGET_SIMT, simt)
+
+DEFHOOK
+(vf,
+"Return number of threads in SIMT thread group on the target.",
+int, (void), NULL)
+
+HOOK_VECTOR_END (simt)
+
 /* Functions relating to openacc.  */
 #undef HOOK_PREFIX
 #define HOOK_PREFIX "TARGET_GOACC_"
-- 
2.6.6

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/8] NVPTX offloading to NVPTX: backend patches
  2016-06-09 16:54 [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Alexander Monakov
                   ` (7 preceding siblings ...)
  2016-06-09 16:54 ` [PATCH 8/8] nvptx: handle OpenMP "omp target entrypoint" Alexander Monakov
@ 2016-06-09 17:00 ` Jakub Jelinek
  8 siblings, 0 replies; 16+ messages in thread
From: Jakub Jelinek @ 2016-06-09 17:00 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: gcc-patches, Nathan Sidwell

On Thu, Jun 09, 2016 at 07:53:52PM +0300, Alexander Monakov wrote:
> I'm sending updated patch series with backend prerequisites for OpenMP
> offloading to the NVIDIA PTX ISA.  The first patch has already received some
> comments and this version reflects review feedback.  The other patches have
> been adjusted for clarity and re-cut in a more rigorous manner.  All patches are
> rebased onto current trunk.
> 
> Jakub, can you offer wishes/recommendations for sending the rest of
> (middle-end and libgomp) patches?  As you know there's a branch with

Once all the prerequisites are in (I assume the patches depend on the NVPTX
backend patches you've just posted), then I'd prefer if you rebase the rest
to current trunk and post  in reasonably reviewable chunks (that can be all
of middle-end changes in one patch, all of libgomp plugin changes, all of
other libgomp changes, or if some of those would be too large, split that a
little bit).

	Jakub

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/8] nvptx -muniform-simt
  2016-06-09 16:54 ` [PATCH 3/8] nvptx -muniform-simt Alexander Monakov
@ 2016-06-13  3:55   ` Sandra Loosemore
  2016-06-13 17:37     ` Alexander Monakov
  0 siblings, 1 reply; 16+ messages in thread
From: Sandra Loosemore @ 2016-06-13  3:55 UTC (permalink / raw)
  To: Alexander Monakov, gcc-patches; +Cc: Nathan Sidwell

On 06/09/2016 10:53 AM, Alexander Monakov wrote:
>
> [snip]
>
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -19621,6 +19621,16 @@ for OpenMP offloading, but the option is exposed on its own for the purpose
>   of testing the compiler; to generate code suitable for linking into programs
>   using OpenMP offloading, use option @option{-mgomp}.
>
> +@item -muniform-simt
> +@opindex muniform-simt
> +Generate code that allows to keep all lanes in each warp active, even when

Allows *what* to keep?  E.g. what is doing the keeping here?  If it is 
the generated code itself, please rephrase as

Generate code that keeps....

> +observable effects from execution should appear as if only one lane was

s/was/is/

> +active. This is achieved by instrumenting syscalls and atomic instructions in
> +a lightweight way that allows to switch behavior at runtime. This code

Same issue here....  allows *what* to switch behavior?  (And how would 
you select which run-time behavior you want?)

Also, in the snippet above where it is used as a noun, please
s/runtime/run time/

> +generation variant is used for OpenMP offloading, but the option is exposed on
> +its own for the purpose of testing the compiler; to generate code suitable for
> +linking into programs using OpenMP offloading, use option @option{-mgomp}.
> +
>   @end table
>
>   @node PDP-11 Options

-Sandra

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/8] nvptx -mgomp
  2016-06-09 16:54 ` [PATCH 4/8] nvptx -mgomp Alexander Monakov
@ 2016-06-13  3:57   ` Sandra Loosemore
  0 siblings, 0 replies; 16+ messages in thread
From: Sandra Loosemore @ 2016-06-13  3:57 UTC (permalink / raw)
  To: Alexander Monakov, gcc-patches; +Cc: Nathan Sidwell

On 06/09/2016 10:53 AM, Alexander Monakov wrote:
>
> [snip]
>
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -19631,6 +19631,11 @@ generation variant is used for OpenMP offloading, but the option is exposed on
>   its own for the purpose of testing the compiler; to generate code suitable for
>   linking into programs using OpenMP offloading, use option @option{-mgomp}.
>
> +@item -mgomp
> +@opindex mgomp
> +Generate code for use in OpenMP offloading: enables @option{-msoft-stack} and

s/enables/enables the/

> +@option{-muniform-simt} options, and selects corresponding multilib variant.

s/selects/selects the/

> +
>   @end table
>
>   @node PDP-11 Options
>

The documentation part is OK with those changes.

-Sandra

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/8] nvptx -muniform-simt
  2016-06-13  3:55   ` Sandra Loosemore
@ 2016-06-13 17:37     ` Alexander Monakov
  2016-06-22 18:39       ` Alexander Monakov
  0 siblings, 1 reply; 16+ messages in thread
From: Alexander Monakov @ 2016-06-13 17:37 UTC (permalink / raw)
  To: Sandra Loosemore; +Cc: gcc-patches, Nathan Sidwell

On Sun, 12 Jun 2016, Sandra Loosemore wrote:
> On 06/09/2016 10:53 AM, Alexander Monakov wrote:
> > +@item -muniform-simt
> > +@opindex muniform-simt
> > +Generate code that allows to keep all lanes in each warp active, even when
> 
> Allows *what* to keep?  E.g. what is doing the keeping here?  If it is the
> generated code itself, please rephrase as
> 
> Generate code that keeps....

Let me try to expand and rephrase what I meant:

Allows the compiler to emit code that, at run time, may have all lanes active,
particularly in those regions of the program where observable effects from
execution must happen as if one lane is active (outside of SIMD loops).

But nevertheless generated code can run just like conventionally generated
code does: with each lane being active/inactive independently, and side
effects happening from each active lane (inside of SIMD loops).

Whether it actually runs in the former (let's call it "uniform") or the latter
("conventional") way is switchable at run time. The compiler itself is
responsible for emitting mode changes at SIMD region boundaries.

Does this help? Below I went with your suggestion, but changed "keeps" to "may
keep" because that's generally true only outside of SIMD regions.

> > +observable effects from execution should appear as if only one lane was
> 
> s/was/is/
> 
> > +active. This is achieved by instrumenting syscalls and atomic instructions
> > in
> > +a lightweight way that allows to switch behavior at runtime. This code
> 
> Same issue here....  allows *what* to switch behavior?  (And how would you
> select which run-time behavior you want?)

Sorry. This gives compiler itself a way to emit code that will switch behavior
of the subsequently running code.

> Also, in the snippet above where it is used as a noun, please
> s/runtime/run time/

Thanks. Does the following look better?

@item -muniform-simt
@opindex muniform-simt
Generate code that may keep all lanes in each warp active, even when
observable effects from execution must appear as if only one lane is active.
This is achieved by instrumenting syscalls and atomic instructions in a
lightweight way, allowing the compiler to emit code that can switch at run
time between this and conventional execution modes. This code generation
variant is used for OpenMP offloading, but the option is exposed on its own
for the purpose of testing the compiler; to generate code suitable for linking
into programs using OpenMP offloading, use option @option{-mgomp}.

Alexander

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/8] nvptx -muniform-simt
  2016-06-13 17:37     ` Alexander Monakov
@ 2016-06-22 18:39       ` Alexander Monakov
  2016-06-24 20:41         ` Sandra Loosemore
  0 siblings, 1 reply; 16+ messages in thread
From: Alexander Monakov @ 2016-06-22 18:39 UTC (permalink / raw)
  To: Sandra Loosemore; +Cc: gcc-patches, Nathan Sidwell

Ping.

On Mon, 13 Jun 2016, Alexander Monakov wrote:
> On Sun, 12 Jun 2016, Sandra Loosemore wrote:
> > On 06/09/2016 10:53 AM, Alexander Monakov wrote:
> > > +@item -muniform-simt
> > > +@opindex muniform-simt
> > > +Generate code that allows to keep all lanes in each warp active, even when
> > 
> > Allows *what* to keep?  E.g. what is doing the keeping here?  If it is the
> > generated code itself, please rephrase as
> > 
> > Generate code that keeps....
> 
> Let me try to expand and rephrase what I meant:
> 
> Allows the compiler to emit code that, at run time, may have all lanes active,
> particularly in those regions of the program where observable effects from
> execution must happen as if one lane is active (outside of SIMD loops).
> 
> But nevertheless generated code can run just like conventionally generated
> code does: with each lane being active/inactive independently, and side
> effects happening from each active lane (inside of SIMD loops).
> 
> Whether it actually runs in the former (let's call it "uniform") or the latter
> ("conventional") way is switchable at run time. The compiler itself is
> responsible for emitting mode changes at SIMD region boundaries.
> 
> Does this help? Below I went with your suggestion, but changed "keeps" to "may
> keep" because that's generally true only outside of SIMD regions.
> 
> > > +observable effects from execution should appear as if only one lane was
> > 
> > s/was/is/
> > 
> > > +active. This is achieved by instrumenting syscalls and atomic instructions
> > > in
> > > +a lightweight way that allows to switch behavior at runtime. This code
> > 
> > Same issue here....  allows *what* to switch behavior?  (And how would you
> > select which run-time behavior you want?)
> 
> Sorry. This gives compiler itself a way to emit code that will switch behavior
> of the subsequently running code.
> 
> > Also, in the snippet above where it is used as a noun, please
> > s/runtime/run time/
> 
> Thanks. Does the following look better?
> 
> @item -muniform-simt
> @opindex muniform-simt
> Generate code that may keep all lanes in each warp active, even when
> observable effects from execution must appear as if only one lane is active.
> This is achieved by instrumenting syscalls and atomic instructions in a
> lightweight way, allowing the compiler to emit code that can switch at run
> time between this and conventional execution modes. This code generation
> variant is used for OpenMP offloading, but the option is exposed on its own
> for the purpose of testing the compiler; to generate code suitable for linking
> into programs using OpenMP offloading, use option @option{-mgomp}.
> 
> Alexander

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/8] nvptx -muniform-simt
  2016-06-22 18:39       ` Alexander Monakov
@ 2016-06-24 20:41         ` Sandra Loosemore
  0 siblings, 0 replies; 16+ messages in thread
From: Sandra Loosemore @ 2016-06-24 20:41 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: gcc-patches, Nathan Sidwell

On 06/22/2016 12:39 PM, Alexander Monakov wrote:
> Ping.
>
> On Mon, 13 Jun 2016, Alexander Monakov wrote:
>> Does the following look better?
>>
>> @item -muniform-simt
>> @opindex muniform-simt
>> Generate code that may keep all lanes in each warp active, even when
>> observable effects from execution must appear as if only one lane is active.
>> This is achieved by instrumenting syscalls and atomic instructions in a
>> lightweight way, allowing the compiler to emit code that can switch at run
>> time between this and conventional execution modes. This code generation
>> variant is used for OpenMP offloading, but the option is exposed on its own
>> for the purpose of testing the compiler; to generate code suitable for linking
>> into programs using OpenMP offloading, use option @option{-mgomp}.

Yes, this version looks fine.  Sorry about the delay in reviewing this.  :-(

-Sandra


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 6/8] new target hook: TARGET_SIMT_VF
  2016-10-14 16:40 Alexander Monakov
@ 2016-10-14 16:40 ` Alexander Monakov
  0 siblings, 0 replies; 16+ messages in thread
From: Alexander Monakov @ 2016-10-14 16:40 UTC (permalink / raw)
  To: gcc-patches; +Cc: Nathan Sidwell

This patch adds a new target hook and implements it in a straightforward
manner on NVPTX to indicate that the target is running in SIMT fashion with 32
threads in a synchronous group ("warp").  For use in OpenMP transforms.

	* config/nvptx/nvptx.c (nvptx_simt_vf): New.
	(TARGET_SIMT_VF): Define.
	* doc/tm.texi: Regenerate.
	* doc/tm.texi.in: (TARGET_SIMT_VF): New hook.
	* target.def: Define it.
---
 gcc/config/nvptx/nvptx.c | 11 +++++++++++
 gcc/doc/tm.texi          |  4 ++++
 gcc/doc/tm.texi.in       |  2 ++
 gcc/target.def           | 12 ++++++++++++
 4 files changed, 29 insertions(+)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 1c3267f..ef85ef6 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4453,6 +4453,14 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
 #define PTX_WORKER_LENGTH 32
 #define PTX_GANG_DEFAULT  32
 
+/* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
+
+static int
+nvptx_simt_vf ()
+{
+  return PTX_VECTOR_LENGTH;
+}
+
 /* Validate compute dimensions of an OpenACC offload or routine, fill
    in non-unity defaults.  FN_LEVEL indicates the level at which a
    routine might spawn a loop.  It is negative for non-routines.  If
@@ -5221,6 +5229,9 @@ nvptx_goacc_reduction (gcall *call)
 #undef  TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
 
+#undef TARGET_SIMT_VF
+#define TARGET_SIMT_VF nvptx_simt_vf
+
 #undef TARGET_GOACC_VALIDATE_DIMS
 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index a4a8e49..76477d6 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5836,6 +5836,10 @@ usable.  In that case, the smaller the number is, the more desirable it is
 to use it.
 @end deftypefn
 
+@deftypefn {Target Hook} int TARGET_SIMT_VF (void)
+Return number of threads in SIMT thread group on the target.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_GOACC_VALIDATE_DIMS (tree @var{decl}, int *@var{dims}, int @var{fn_level})
 This hook should check the launch dimensions provided for an OpenACC
 compute region, or routine.  Defaulted values are represented as -1
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 265f1be..36672af 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4289,6 +4289,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_SIMD_CLONE_USABLE
 
+@hook TARGET_SIMT_VF
+
 @hook TARGET_GOACC_VALIDATE_DIMS
 
 @hook TARGET_GOACC_DIM_LIMIT
diff --git a/gcc/target.def b/gcc/target.def
index b6968f7..0018f4d 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1648,6 +1648,18 @@ int, (struct cgraph_node *), NULL)
 
 HOOK_VECTOR_END (simd_clone)
 
+/* Functions relating to OpenMP SIMT vectorization transform.  */
+#undef HOOK_PREFIX
+#define HOOK_PREFIX "TARGET_SIMT_"
+HOOK_VECTOR (TARGET_SIMT, simt)
+
+DEFHOOK
+(vf,
+"Return number of threads in SIMT thread group on the target.",
+int, (void), NULL)
+
+HOOK_VECTOR_END (simt)
+
 /* Functions relating to openacc.  */
 #undef HOOK_PREFIX
 #define HOOK_PREFIX "TARGET_GOACC_"
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2016-10-14 16:40 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-06-09 16:54 [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Alexander Monakov
2016-06-09 16:54 ` [PATCH 5/8] nvptx mkoffload: pass -mgomp for OpenMP offloading Alexander Monakov
2016-06-09 16:54 ` [PATCH 7/8] nvptx: new insns for OpenMP SIMD-on-SIMT Alexander Monakov
2016-06-09 16:54 ` [PATCH 1/8] nvptx -msoft-stack Alexander Monakov
2016-06-09 16:54 ` [PATCH 6/8] new target hook: TARGET_SIMT_VF Alexander Monakov
2016-06-09 16:54 ` [PATCH 4/8] nvptx -mgomp Alexander Monakov
2016-06-13  3:57   ` Sandra Loosemore
2016-06-09 16:54 ` [PATCH 3/8] nvptx -muniform-simt Alexander Monakov
2016-06-13  3:55   ` Sandra Loosemore
2016-06-13 17:37     ` Alexander Monakov
2016-06-22 18:39       ` Alexander Monakov
2016-06-24 20:41         ` Sandra Loosemore
2016-06-09 16:54 ` [PATCH 2/8] nvptx: implement predicated instructions Alexander Monakov
2016-06-09 16:54 ` [PATCH 8/8] nvptx: handle OpenMP "omp target entrypoint" Alexander Monakov
2016-06-09 17:00 ` [PATCH 0/8] NVPTX offloading to NVPTX: backend patches Jakub Jelinek
2016-10-14 16:40 Alexander Monakov
2016-10-14 16:40 ` [PATCH 6/8] new target hook: TARGET_SIMT_VF Alexander Monakov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).