public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* Re: Improving code generation in the nvptx back end
       [not found] <87ino97z1i.fsf@euler.schwinge.homeip.net>
@ 2017-02-17 13:23 ` Thomas Schwinge
  2017-02-17 20:04   ` Cesar Philippidis
                     ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Thomas Schwinge @ 2017-02-17 13:23 UTC (permalink / raw)
  To: gcc-patches; +Cc: gcc, Alexander Monakov, Bernd Schmidt

Hi!

On Fri, 17 Feb 2017 14:00:09 +0100, I wrote:
> [...] for "normal" functions there is no reason to use the
> ".param" space for passing arguments in and out of functions.  We can
> then get rid of the boilerplate code to move ".param %in_ar*" into ".reg
> %ar*", and the other way round for "%value_out"/"%value".  This will then
> also simplify the call sites, where all that code "evaporates".  That's
> actually something I started to look into, many months ago, and I now
> just dug out those changes, and will post them later.
> 
> (Very likely, the PTX "JIT" compiler will do the very same thing without
> difficulty, but why not directly generate code that is less verbose to
> read?)

Using my WIP patch, the generated PTX code changes/is simplified as
follows:

     // BEGIN GLOBAL FUNCTION DECL: f
    -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1);
    +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1);
    
     // BEGIN GLOBAL FUNCTION DEF: f
    -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1)
    +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1)
     {
            .reg.f32 %value;
    -       .reg.u32 %ar0;
    -       ld.param.u32 %ar0, [%in_ar0];
    -       .reg.u64 %ar1;
    -       ld.param.u64 %ar1, [%in_ar1];
            .reg.f64 %r23;
            .reg.f32 %r24;
            .reg.u32 %r25;
    @@ -34,15 +30,15 @@ $L3:
                    mov.f32 %r24, 0f00000000;
     $L1:
                    mov.f32 %value, %r24;
    -       st.param.f32    [%value_out], %value;
    +       mov.f32 %value_out, %value;
            ret;
     }
    
     // BEGIN GLOBAL FUNCTION DECL: main
    -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1);
    +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1);
    
     // BEGIN GLOBAL FUNCTION DEF: main
    -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1)
    +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1)
     {
            .reg.u32 %value;
            .local .align 8 .b8 %frame_ar[32];
    @@ -70,13 +66,9 @@ $L1:
                    st.u64  [%frame+24], %r29;
                    add.u64 %r31, %frame, 16;
            {
    -               .param.f32 %value_in;
    -               .param.u32 %out_arg1;
    -               st.param.u32 [%out_arg1], %r26;
    -               .param.u64 %out_arg2;
    -               st.param.u64 [%out_arg2], %r31;
    -               call (%value_in), f, (%out_arg1, %out_arg2);
    -               ld.param.f32    %r32, [%value_in];
    +               .reg.f32 %value_in;
    +               call (%value_in), f, (%r26, %r31);
    +               mov.f32 %r32, %value_in;
            }
                    setp.eq.f32     %r33, %r32, 0f00000000;
            @%r33   bra     $L5;
    @@ -89,17 +81,13 @@ $L5:
                    st.u64  [%frame+24], %r36;
                    mov.u32 %r34, 1;
            {
    -               .param.f32 %value_in;
    -               .param.u32 %out_arg1;
    -               st.param.u32 [%out_arg1], %r34;
    -               .param.u64 %out_arg2;
    -               st.param.u64 [%out_arg2], %r31;
    -               call (%value_in), f, (%out_arg1, %out_arg2);
    -               ld.param.f32    %r39, [%value_in];
    +               .reg.f32 %value_in;
    +               call (%value_in), f, (%r34, %r31);
    +               mov.f32 %r39, %value_in;
            }
                    setp.neu.f32    %r40, %r39, 0f3f800000;
            @%r40   bra     $L6;
                    mov.u32 %value, 0;
    -       st.param.u32    [%value_out], %value;
    +       mov.u32 %value_out, %value;
            ret;
     }

(Not yet directly using "%value_out" instead of the intermediate "%value"
register.)

Is such a patch something to pursue to completion?

--- gcc/config/nvptx/nvptx.c
+++ gcc/config/nvptx/nvptx.c
@@ -603,19 +603,32 @@ nvptx_promote_function_mode (const_tree type, machine_mode mode,
    to an argument register and it is greater than zero if we're
    copying to a specific hard register.  */
 
+static bool write_as_kernel (tree attrs);
 static int
 write_arg_mode (std::stringstream &s, int for_reg, int argno,
-		machine_mode mode)
+		machine_mode mode, const_tree decl)
 {
+  bool kernel = (decl != NULL_TREE) && write_as_kernel (DECL_ATTRIBUTES (decl));
   const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
 
   if (for_reg < 0)
     {
       /* Writing PTX prototype.  */
       s << (argno ? ", " : " (");
-      s << ".param" << ptx_type << " %in_ar" << argno;
+      if (kernel)
+	s << ".param" << ptx_type << " %in_ar" << argno;
+      else
+#if 0
+	s << ".reg" << ptx_type << " %in_ar" << argno;
+#else
+	s << ".reg" << ptx_type << " %ar" << argno;
+#endif
     }
+#if 0
   else
+#else
+  else if (kernel || for_reg)
+#endif
     {
       s << "\t.reg" << ptx_type << " ";
       if (for_reg)
@@ -625,12 +638,31 @@ write_arg_mode (std::stringstream &s, int for_reg, int argno,
       s << ";\n";
       if (argno >= 0)
 	{
-	  s << "\tld.param" << ptx_type << " ";
-	  if (for_reg)
-	    s << reg_names[for_reg];
+	  if (kernel)
+	    {
+	      s << "\tld.param" << ptx_type << " ";
+	      if (for_reg)
+		s << reg_names[for_reg];
+	      else
+		s << "%ar" << argno;
+	      s << ", [%in_ar" << argno << "];\n";
+	    }
 	  else
-	    s << "%ar" << argno;
-	  s << ", [%in_ar" << argno << "];\n";
+	    {
+	      s << "\tmov" << ptx_type << " ";
+	      if (for_reg)
+		s << reg_names[for_reg];
+	      else
+		s << "%ar" << argno;
+	      /* TODO: we should directly emit "reg_names[for_reg]" above when
+		 writing prototype, but will need to change all call sites,
+		 because these just pass in -1 for for_reg.  With that changed,
+		 we can then avoid this additional ".reg", and the "mov".  */
+	      if (for_reg)
+		s << ", %ar" << argno << ";\n";
+	      else
+		s << ", %in_ar" << argno << ";\n";
+	    }
 	}
     }
   return argno + 1;
@@ -646,7 +678,7 @@ write_arg_mode (std::stringstream &s, int for_reg, int argno,
 
 static int
 write_arg_type (std::stringstream &s, int for_reg, int argno,
-		tree type, bool prototyped)
+		tree type, bool prototyped, const_tree decl)
 {
   machine_mode mode = TYPE_MODE (type);
 
@@ -669,24 +701,31 @@ write_arg_type (std::stringstream &s, int for_reg, int argno,
 
       mode = promote_arg (mode, prototyped);
       if (split)
-	argno = write_arg_mode (s, for_reg, argno, mode);
+	argno = write_arg_mode (s, for_reg, argno, mode, decl);
     }
 
-  return write_arg_mode (s, for_reg, argno, mode);
+  return write_arg_mode (s, for_reg, argno, mode, decl);
 }
 
 /* Emit a PTX return as a prototype or function prologue declaration
    for MODE.  */
 
 static void
-write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
+write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode, const_tree decl)
 {
   const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
   const char *pfx = "\t.reg";
   const char *sfx = ";\n";
   
   if (for_proto)
-    pfx = "(.param", sfx = "_out) ";
+    {
+      bool kernel = (decl != NULL_TREE) && write_as_kernel (DECL_ATTRIBUTES (decl));
+      if (kernel)
+	pfx = "(.param";
+      else
+	pfx = "(.reg";
+      sfx = "_out) ";
+    }
   
   s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
 }
@@ -697,7 +736,7 @@ write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
    match the regular GCC function return mashalling.  */
 
 static bool
-write_return_type (std::stringstream &s, bool for_proto, tree type)
+write_return_type (std::stringstream &s, bool for_proto, tree type, const_tree decl)
 {
   machine_mode mode = TYPE_MODE (type);
 
@@ -728,7 +767,7 @@ write_return_type (std::stringstream &s, bool for_proto, tree type)
   else
     mode = promote_return (mode);
 
-  write_return_mode (s, for_proto, mode);
+  write_return_mode (s, for_proto, mode, decl);
 
   return return_in_mem;
 }
@@ -824,7 +863,7 @@ write_fn_proto (std::stringstream &s, bool is_defn,
       }
 
   /* Declare the result.  */
-  bool return_in_mem = write_return_type (s, true, result_type);
+  bool return_in_mem = write_return_type (s, true, result_type, decl);
 
   s << name;
 
@@ -832,7 +871,7 @@ write_fn_proto (std::stringstream &s, bool is_defn,
 
   /* Emit argument list.  */
   if (return_in_mem)
-    argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+    argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
 
   /* We get:
      NULL in TYPE_ARG_TYPES, for old-style functions
@@ -852,21 +891,21 @@ write_fn_proto (std::stringstream &s, bool is_defn,
       tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
       
       if (not_atomic_weak_arg)
-	argno = write_arg_type (s, -1, argno, type, prototyped);
+	argno = write_arg_type (s, -1, argno, type, prototyped, decl);
       else
 	gcc_assert (type == boolean_type_node);
     }
 
   if (stdarg_p (fntype))
-    argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+    argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
 
   if (DECL_STATIC_CHAIN (decl))
-    argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+    argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
 
   if (!argno && strcmp (name, "main") == 0)
     {
-      argno = write_arg_type (s, -1, argno, integer_type_node, true);
-      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+      argno = write_arg_type (s, -1, argno, integer_type_node, true, decl);
+      argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
     }
 
   if (argno)
@@ -899,7 +938,7 @@ write_fn_proto_from_insn (std::stringstream &s, const char *name,
     }
 
   if (result != NULL_RTX)
-    write_return_mode (s, true, GET_MODE (result));
+    write_return_mode (s, true, GET_MODE (result), NULL_TREE);
 
   s << name;
 
@@ -911,7 +950,7 @@ write_fn_proto_from_insn (std::stringstream &s, const char *name,
 	 sequence.  */
       machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
 
-      write_arg_mode (s, -1, i - 1, mode);
+      write_arg_mode (s, -1, i - 1, mode, NULL_TREE);
     }
   if (arg_end != 1)
     s << ")";
@@ -1189,9 +1228,9 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
   write_fn_proto (s, true, name, decl);
   s << "{\n";
 
-  bool return_in_mem = write_return_type (s, false, result_type);
+  bool return_in_mem = write_return_type (s, false, result_type, decl);
   if (return_in_mem)
-    argno = write_arg_type (s, 0, argno, ptr_type_node, true);
+    argno = write_arg_type (s, 0, argno, ptr_type_node, true, decl);
   
   /* Declare and initialize incoming arguments.  */
   tree args = TYPE_ARG_TYPES (fntype);
@@ -1206,17 +1245,17 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
     {
       tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
 
-      argno = write_arg_type (s, 0, argno, type, prototyped);
+      argno = write_arg_type (s, 0, argno, type, prototyped, decl);
     }
 
   if (stdarg_p (fntype))
     argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
-			    true);
+			    true, decl);
 
   if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
     write_arg_type (s, STATIC_CHAIN_REGNUM,
 		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
-		    true);
+		    true, decl);
 
   fprintf (file, "%s", s.str().c_str());
 
@@ -1290,8 +1329,14 @@ nvptx_output_return (void)
 {
   machine_mode mode = (machine_mode)cfun->machine->return_mode;
 
+  const char *fmt;
+  bool kernel = write_as_kernel (DECL_ATTRIBUTES (current_function_decl));
+  if (kernel)
+    fmt = "\tst.param%s\t[%s_out], %s;\n";
+  else
+    fmt = "\tmov%s\t%s_out, %s;\n";
   if (mode != VOIDmode)
-    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
+    fprintf (asm_out_file, fmt,
 	     nvptx_ptx_type_from_mode (mode, false),
 	     reg_names[NVPTX_RETURN_REGNUM],
 	     reg_names[NVPTX_RETURN_REGNUM]);
@@ -2063,7 +2108,9 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
 
   fprintf (asm_out_file, "\t{\n");
   if (result != NULL)
-    fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
+    //We can never have a kernel call another kernel.
+    //fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
+    fprintf (asm_out_file, "\t\t.reg%s %s_in;\n",
 	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
 	     reg_names[NVPTX_RETURN_REGNUM]);
 
@@ -2088,6 +2135,7 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
       fputs (s.str().c_str(), asm_out_file);
     }
 
+#if 0
   for (int argno = 1; argno < arg_end; argno++)
     {
       rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
@@ -2095,12 +2143,14 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
       const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
 
       /* Mode splitting has already been done.  */
-      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
-	       "\t\tst.param%s [%%out_arg%d], ",
+      //We can never have a kernel call another kernel, so don't need to care for ".param" here.
+      fprintf (asm_out_file, "\t\t.reg%s %%out_arg%d;\n"
+	       "\t\tmov%s %%out_arg%d, ",
 	       ptx_type, argno, ptx_type, argno);
       output_reg (asm_out_file, REGNO (t), VOIDmode);
       fprintf (asm_out_file, ";\n");
     }
+#endif
 
   /* The '.' stands for the call's predicate, if any.  */
   nvptx_print_operand (asm_out_file, NULL_RTX, '.');
@@ -2120,7 +2170,13 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
   const char *open = "(";
   for (int argno = 1; argno < arg_end; argno++)
     {
+#if 0
       fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
+#else
+      fprintf (asm_out_file, ", %s", open);
+      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
+      output_reg (asm_out_file, REGNO (t), VOIDmode);
+#endif
       open = "";
     }
   if (decl && DECL_STATIC_CHAIN (decl))
@@ -2147,11 +2203,12 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
 
   if (result)
     {
-      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
+      //We can never have a kernel call another kernel.
+      static char rval[sizeof ("\tmov%%t0\t%%0, %%%s_in;\n\t}") + 8];
 
       if (!rval[0])
 	/* We must escape the '%' that starts RETURN_REGNUM.  */
-	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
+	sprintf (rval, "\tmov%%t0\t%%0, %%%s_in;\n\t}",
 		 reg_names[NVPTX_RETURN_REGNUM]);
       return rval;
     }


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Re: Improving code generation in the nvptx back end
  2017-02-17 13:23 ` Improving code generation in the nvptx back end Thomas Schwinge
@ 2017-02-17 20:04   ` Cesar Philippidis
  2017-02-20 13:59   ` Bernd Schmidt
  2017-02-20 15:17   ` Alexander Monakov
  2 siblings, 0 replies; 4+ messages in thread
From: Cesar Philippidis @ 2017-02-17 20:04 UTC (permalink / raw)
  To: Thomas Schwinge, gcc-patches; +Cc: gcc, Alexander Monakov, Bernd Schmidt

On 02/17/2017 05:09 AM, Thomas Schwinge wrote:

> On Fri, 17 Feb 2017 14:00:09 +0100, I wrote:
>> [...] for "normal" functions there is no reason to use the
>> ".param" space for passing arguments in and out of functions.  We can
>> then get rid of the boilerplate code to move ".param %in_ar*" into ".reg
>> %ar*", and the other way round for "%value_out"/"%value".  This will then
>> also simplify the call sites, where all that code "evaporates".  That's
>> actually something I started to look into, many months ago, and I now
>> just dug out those changes, and will post them later.
>>
>> (Very likely, the PTX "JIT" compiler will do the very same thing without
>> difficulty, but why not directly generate code that is less verbose to
>> read?)
> 
> Using my WIP patch, the generated PTX code changes/is simplified as
> follows:
> 
>      // BEGIN GLOBAL FUNCTION DECL: f
>     -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1);
>     +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1);
>     
>      // BEGIN GLOBAL FUNCTION DEF: f
>     -.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1)
>     +.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1)
>      {
>             .reg.f32 %value;
>     -       .reg.u32 %ar0;
>     -       ld.param.u32 %ar0, [%in_ar0];
>     -       .reg.u64 %ar1;
>     -       ld.param.u64 %ar1, [%in_ar1];
>             .reg.f64 %r23;
>             .reg.f32 %r24;
>             .reg.u32 %r25;
>     @@ -34,15 +30,15 @@ $L3:
>                     mov.f32 %r24, 0f00000000;
>      $L1:
>                     mov.f32 %value, %r24;
>     -       st.param.f32    [%value_out], %value;
>     +       mov.f32 %value_out, %value;
>             ret;
>      }
>     
>      // BEGIN GLOBAL FUNCTION DECL: main
>     -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1);
>     +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1);
>     
>      // BEGIN GLOBAL FUNCTION DEF: main
>     -.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1)
>     +.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1)
>      {
>             .reg.u32 %value;
>             .local .align 8 .b8 %frame_ar[32];
>     @@ -70,13 +66,9 @@ $L1:
>                     st.u64  [%frame+24], %r29;
>                     add.u64 %r31, %frame, 16;
>             {
>     -               .param.f32 %value_in;
>     -               .param.u32 %out_arg1;
>     -               st.param.u32 [%out_arg1], %r26;
>     -               .param.u64 %out_arg2;
>     -               st.param.u64 [%out_arg2], %r31;
>     -               call (%value_in), f, (%out_arg1, %out_arg2);
>     -               ld.param.f32    %r32, [%value_in];
>     +               .reg.f32 %value_in;
>     +               call (%value_in), f, (%r26, %r31);
>     +               mov.f32 %r32, %value_in;
>             }
>                     setp.eq.f32     %r33, %r32, 0f00000000;
>             @%r33   bra     $L5;
>     @@ -89,17 +81,13 @@ $L5:
>                     st.u64  [%frame+24], %r36;
>                     mov.u32 %r34, 1;
>             {
>     -               .param.f32 %value_in;
>     -               .param.u32 %out_arg1;
>     -               st.param.u32 [%out_arg1], %r34;
>     -               .param.u64 %out_arg2;
>     -               st.param.u64 [%out_arg2], %r31;
>     -               call (%value_in), f, (%out_arg1, %out_arg2);
>     -               ld.param.f32    %r39, [%value_in];
>     +               .reg.f32 %value_in;
>     +               call (%value_in), f, (%r34, %r31);
>     +               mov.f32 %r39, %value_in;
>             }
>                     setp.neu.f32    %r40, %r39, 0f3f800000;
>             @%r40   bra     $L6;
>                     mov.u32 %value, 0;
>     -       st.param.u32    [%value_out], %value;
>     +       mov.u32 %value_out, %value;
>             ret;
>      }
> 
> (Not yet directly using "%value_out" instead of the intermediate "%value"
> register.)
> 
> Is such a patch something to pursue to completion?

Are you trying to optimize acc routines in general? I'm not sure how
frequently they are used at the moment.

Also, while .param values may be overkill for routines, they are
addressable. Looking at section 5.1.6.1 in the PTX reference manual, you
can have something like this:

.entry foo ( .param .b32 N, .param .align 8 .b8 buffer[64] )
{
  .reg .u32 %n;
  .reg .f64 %d;
  ld.param.u32 %n, [N];
  ld.param.f64 %d, [buffer];
  ...

Granted, this is an entry function to be called from the host, but the
same usage is applicable inside routines.

This gives me an idea. While working on the firstprivate changes, I
noticed that GCC packs all of the offloaded function arguments into a
structure, which the nvptx run time plugin uploads to a special data
mapping prior to calling cuLaunchKernel. That's inefficient in
applications that launch a lot of small offloaded regions because those
data transfers require an additional hardware synchronization. In light
of this observation, I had originally proposed that we teach GCC how to
invoke the offloaded region with individual arguments for each offloaded
variable instead of a pointer to a packed struct, because cuLaunchKernel
supports the former more efficiently. However, given that param values
can be addressable values, it would probably be more straightforward to
just pass in the struct containing the offloaded variables by value
instead of by reference.

I'd need to look more closely at the PTX ISA to see if this is even
feasible.

Cesar

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Improving code generation in the nvptx back end
  2017-02-17 13:23 ` Improving code generation in the nvptx back end Thomas Schwinge
  2017-02-17 20:04   ` Cesar Philippidis
@ 2017-02-20 13:59   ` Bernd Schmidt
  2017-02-20 15:17   ` Alexander Monakov
  2 siblings, 0 replies; 4+ messages in thread
From: Bernd Schmidt @ 2017-02-20 13:59 UTC (permalink / raw)
  To: Thomas Schwinge, gcc-patches; +Cc: gcc, Alexander Monakov, Julian Brown

On 02/17/2017 02:09 PM, Thomas Schwinge wrote:
> Hi!
>
> On Fri, 17 Feb 2017 14:00:09 +0100, I wrote:
>> [...] for "normal" functions there is no reason to use the
>> ".param" space for passing arguments in and out of functions.  We can
>> then get rid of the boilerplate code to move ".param %in_ar*" into ".reg
>> %ar*", and the other way round for "%value_out"/"%value".  This will then
>> also simplify the call sites, where all that code "evaporates".  That's
>> actually something I started to look into, many months ago, and I now
>> just dug out those changes, and will post them later.
>>
>> (Very likely, the PTX "JIT" compiler will do the very same thing without
>> difficulty, but why not directly generate code that is less verbose to
>> read?)
>
> Using my WIP patch, the generated PTX code changes/is simplified as
> follows:

It's probably a good idea to run cuobjdump -sass to see whether this has 
any effect at all.

The most important issue that needs solving is probably still the old 
issue that ptxas isn't reliable. Looks like the llvm folks ran into the 
same problem, as I discovered last week:
   https://bugs.llvm.org/show_bug.cgi?id=27738


Bernd

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Improving code generation in the nvptx back end
  2017-02-17 13:23 ` Improving code generation in the nvptx back end Thomas Schwinge
  2017-02-17 20:04   ` Cesar Philippidis
  2017-02-20 13:59   ` Bernd Schmidt
@ 2017-02-20 15:17   ` Alexander Monakov
  2 siblings, 0 replies; 4+ messages in thread
From: Alexander Monakov @ 2017-02-20 15:17 UTC (permalink / raw)
  To: Thomas Schwinge; +Cc: gcc-patches, gcc, Bernd Schmidt

On Fri, 17 Feb 2017, Thomas Schwinge wrote:
> On Fri, 17 Feb 2017 14:00:09 +0100, I wrote:
> > [...] for "normal" functions there is no reason to use the
> > ".param" space for passing arguments in and out of functions.  We can
> > then get rid of the boilerplate code to move ".param %in_ar*" into ".reg
> > %ar*", and the other way round for "%value_out"/"%value".  This will then
> > also simplify the call sites, where all that code "evaporates".  That's
> > actually something I started to look into, many months ago, and I now
> > just dug out those changes, and will post them later.
> > 
> > (Very likely, the PTX "JIT" compiler will do the very same thing without
> > difficulty, but why not directly generate code that is less verbose to
> > read?)

In general you cannot use this shorthand notation because PTX interop guidelines
explicitly prescribe using the .param space for argument passing.  See
https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/ , section 3.

So at best GCC can use it for calls where interop concerns are guaranteed to not
arise: when the callee is not externally visible, and does not have its address
taken.  And there's a question of how well it's going to work after time passes,
since other compilers always use the verbose form (and thus the .reg calling
style is not frequently exercised).

Alexander

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2017-02-20 15:16 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <87ino97z1i.fsf@euler.schwinge.homeip.net>
2017-02-17 13:23 ` Improving code generation in the nvptx back end Thomas Schwinge
2017-02-17 20:04   ` Cesar Philippidis
2017-02-20 13:59   ` Bernd Schmidt
2017-02-20 15:17   ` Alexander Monakov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).