* Re: Improving code generation in the nvptx back end
[not found] <87ino97z1i.fsf@euler.schwinge.homeip.net>
@ 2017-02-17 13:23 ` Thomas Schwinge
2017-02-17 20:04 ` Cesar Philippidis
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Thomas Schwinge @ 2017-02-17 13:23 UTC (permalink / raw)
To: gcc-patches; +Cc: gcc, Alexander Monakov, Bernd Schmidt
Hi!
On Fri, 17 Feb 2017 14:00:09 +0100, I wrote:
> [...] for "normal" functions there is no reason to use the
> ".param" space for passing arguments in and out of functions. We can
> then get rid of the boilerplate code to move ".param %in_ar*" into ".reg
> %ar*", and the other way round for "%value_out"/"%value". This will then
> also simplify the call sites, where all that code "evaporates". That's
> actually something I started to look into, many months ago, and I now
> just dug out those changes, and will post them later.
>
> (Very likely, the PTX "JIT" compiler will do the very same thing without
> difficulty, but why not directly generate code that is less verbose to
> read?)
With my WIP patch applied, the generated PTX code is simplified as
follows:
// BEGIN GLOBAL FUNCTION DECL: f
-.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1);
+.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1);
// BEGIN GLOBAL FUNCTION DEF: f
-.visible .func (.param.f32 %value_out) f (.param.u32 %in_ar0, .param.u64 %in_ar1)
+.visible .func (.reg.f32 %value_out) f (.reg.u32 %ar0, .reg.u64 %ar1)
{
.reg.f32 %value;
- .reg.u32 %ar0;
- ld.param.u32 %ar0, [%in_ar0];
- .reg.u64 %ar1;
- ld.param.u64 %ar1, [%in_ar1];
.reg.f64 %r23;
.reg.f32 %r24;
.reg.u32 %r25;
@@ -34,15 +30,15 @@ $L3:
mov.f32 %r24, 0f00000000;
$L1:
mov.f32 %value, %r24;
- st.param.f32 [%value_out], %value;
+ mov.f32 %value_out, %value;
ret;
}
// BEGIN GLOBAL FUNCTION DECL: main
-.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1);
+.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1);
// BEGIN GLOBAL FUNCTION DEF: main
-.visible .func (.param.u32 %value_out) main (.param.u32 %in_ar0, .param.u64 %in_ar1)
+.visible .func (.reg.u32 %value_out) main (.reg.u32 %ar0, .reg.u64 %ar1)
{
.reg.u32 %value;
.local .align 8 .b8 %frame_ar[32];
@@ -70,13 +66,9 @@ $L1:
st.u64 [%frame+24], %r29;
add.u64 %r31, %frame, 16;
{
- .param.f32 %value_in;
- .param.u32 %out_arg1;
- st.param.u32 [%out_arg1], %r26;
- .param.u64 %out_arg2;
- st.param.u64 [%out_arg2], %r31;
- call (%value_in), f, (%out_arg1, %out_arg2);
- ld.param.f32 %r32, [%value_in];
+ .reg.f32 %value_in;
+ call (%value_in), f, (%r26, %r31);
+ mov.f32 %r32, %value_in;
}
setp.eq.f32 %r33, %r32, 0f00000000;
@%r33 bra $L5;
@@ -89,17 +81,13 @@ $L5:
st.u64 [%frame+24], %r36;
mov.u32 %r34, 1;
{
- .param.f32 %value_in;
- .param.u32 %out_arg1;
- st.param.u32 [%out_arg1], %r34;
- .param.u64 %out_arg2;
- st.param.u64 [%out_arg2], %r31;
- call (%value_in), f, (%out_arg1, %out_arg2);
- ld.param.f32 %r39, [%value_in];
+ .reg.f32 %value_in;
+ call (%value_in), f, (%r34, %r31);
+ mov.f32 %r39, %value_in;
}
setp.neu.f32 %r40, %r39, 0f3f800000;
@%r40 bra $L6;
mov.u32 %value, 0;
- st.param.u32 [%value_out], %value;
+ mov.u32 %value_out, %value;
ret;
}
(This does not yet use "%value_out" directly in place of the intermediate
"%value" register.)
Is such a patch worth pursuing to completion?
--- gcc/config/nvptx/nvptx.c
+++ gcc/config/nvptx/nvptx.c
@@ -603,19 +603,32 @@ nvptx_promote_function_mode (const_tree type, machine_mode mode,
to an argument register and it is greater than zero if we're
copying to a specific hard register. */
+static bool write_as_kernel (tree attrs);
static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
- machine_mode mode)
+ machine_mode mode, const_tree decl)
{
+ bool kernel = (decl != NULL_TREE) && write_as_kernel (DECL_ATTRIBUTES (decl));
const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
if (for_reg < 0)
{
/* Writing PTX prototype. */
s << (argno ? ", " : " (");
- s << ".param" << ptx_type << " %in_ar" << argno;
+ if (kernel)
+ s << ".param" << ptx_type << " %in_ar" << argno;
+ else
+#if 0
+ s << ".reg" << ptx_type << " %in_ar" << argno;
+#else
+ s << ".reg" << ptx_type << " %ar" << argno;
+#endif
}
+#if 0
else
+#else
+ else if (kernel || for_reg)
+#endif
{
s << "\t.reg" << ptx_type << " ";
if (for_reg)
@@ -625,12 +638,31 @@ write_arg_mode (std::stringstream &s, int for_reg, int argno,
s << ";\n";
if (argno >= 0)
{
- s << "\tld.param" << ptx_type << " ";
- if (for_reg)
- s << reg_names[for_reg];
+ if (kernel)
+ {
+ s << "\tld.param" << ptx_type << " ";
+ if (for_reg)
+ s << reg_names[for_reg];
+ else
+ s << "%ar" << argno;
+ s << ", [%in_ar" << argno << "];\n";
+ }
else
- s << "%ar" << argno;
- s << ", [%in_ar" << argno << "];\n";
+ {
+ s << "\tmov" << ptx_type << " ";
+ if (for_reg)
+ s << reg_names[for_reg];
+ else
+ s << "%ar" << argno;
+ /* TODO: we should directly emit "reg_names[for_reg]" above when
+ writing prototype, but will need to change all call sites,
+ because these just pass in -1 for for_reg. With that changed,
+ we can then avoid this additional ".reg", and the "mov". */
+ if (for_reg)
+ s << ", %ar" << argno << ";\n";
+ else
+ s << ", %in_ar" << argno << ";\n";
+ }
}
}
return argno + 1;
@@ -646,7 +678,7 @@ write_arg_mode (std::stringstream &s, int for_reg, int argno,
static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
- tree type, bool prototyped)
+ tree type, bool prototyped, const_tree decl)
{
machine_mode mode = TYPE_MODE (type);
@@ -669,24 +701,31 @@ write_arg_type (std::stringstream &s, int for_reg, int argno,
mode = promote_arg (mode, prototyped);
if (split)
- argno = write_arg_mode (s, for_reg, argno, mode);
+ argno = write_arg_mode (s, for_reg, argno, mode, decl);
}
- return write_arg_mode (s, for_reg, argno, mode);
+ return write_arg_mode (s, for_reg, argno, mode, decl);
}
/* Emit a PTX return as a prototype or function prologue declaration
for MODE. */
static void
-write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
+write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode, const_tree decl)
{
const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
const char *pfx = "\t.reg";
const char *sfx = ";\n";
if (for_proto)
- pfx = "(.param", sfx = "_out) ";
+ {
+ bool kernel = (decl != NULL_TREE) && write_as_kernel (DECL_ATTRIBUTES (decl));
+ if (kernel)
+ pfx = "(.param";
+ else
+ pfx = "(.reg";
+ sfx = "_out) ";
+ }
s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}
@@ -697,7 +736,7 @@ write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
match the regular GCC function return mashalling. */
static bool
-write_return_type (std::stringstream &s, bool for_proto, tree type)
+write_return_type (std::stringstream &s, bool for_proto, tree type, const_tree decl)
{
machine_mode mode = TYPE_MODE (type);
@@ -728,7 +767,7 @@ write_return_type (std::stringstream &s, bool for_proto, tree type)
else
mode = promote_return (mode);
- write_return_mode (s, for_proto, mode);
+ write_return_mode (s, for_proto, mode, decl);
return return_in_mem;
}
@@ -824,7 +863,7 @@ write_fn_proto (std::stringstream &s, bool is_defn,
}
/* Declare the result. */
- bool return_in_mem = write_return_type (s, true, result_type);
+ bool return_in_mem = write_return_type (s, true, result_type, decl);
s << name;
@@ -832,7 +871,7 @@ write_fn_proto (std::stringstream &s, bool is_defn,
/* Emit argument list. */
if (return_in_mem)
- argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+ argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
/* We get:
NULL in TYPE_ARG_TYPES, for old-style functions
@@ -852,21 +891,21 @@ write_fn_proto (std::stringstream &s, bool is_defn,
tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
if (not_atomic_weak_arg)
- argno = write_arg_type (s, -1, argno, type, prototyped);
+ argno = write_arg_type (s, -1, argno, type, prototyped, decl);
else
gcc_assert (type == boolean_type_node);
}
if (stdarg_p (fntype))
- argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+ argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
if (DECL_STATIC_CHAIN (decl))
- argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+ argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
if (!argno && strcmp (name, "main") == 0)
{
- argno = write_arg_type (s, -1, argno, integer_type_node, true);
- argno = write_arg_type (s, -1, argno, ptr_type_node, true);
+ argno = write_arg_type (s, -1, argno, integer_type_node, true, decl);
+ argno = write_arg_type (s, -1, argno, ptr_type_node, true, decl);
}
if (argno)
@@ -899,7 +938,7 @@ write_fn_proto_from_insn (std::stringstream &s, const char *name,
}
if (result != NULL_RTX)
- write_return_mode (s, true, GET_MODE (result));
+ write_return_mode (s, true, GET_MODE (result), NULL_TREE);
s << name;
@@ -911,7 +950,7 @@ write_fn_proto_from_insn (std::stringstream &s, const char *name,
sequence. */
machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
- write_arg_mode (s, -1, i - 1, mode);
+ write_arg_mode (s, -1, i - 1, mode, NULL_TREE);
}
if (arg_end != 1)
s << ")";
@@ -1189,9 +1228,9 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
write_fn_proto (s, true, name, decl);
s << "{\n";
- bool return_in_mem = write_return_type (s, false, result_type);
+ bool return_in_mem = write_return_type (s, false, result_type, decl);
if (return_in_mem)
- argno = write_arg_type (s, 0, argno, ptr_type_node, true);
+ argno = write_arg_type (s, 0, argno, ptr_type_node, true, decl);
/* Declare and initialize incoming arguments. */
tree args = TYPE_ARG_TYPES (fntype);
@@ -1206,17 +1245,17 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
- argno = write_arg_type (s, 0, argno, type, prototyped);
+ argno = write_arg_type (s, 0, argno, type, prototyped, decl);
}
if (stdarg_p (fntype))
argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
- true);
+ true, decl);
if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
write_arg_type (s, STATIC_CHAIN_REGNUM,
DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
- true);
+ true, decl);
fprintf (file, "%s", s.str().c_str());
@@ -1290,8 +1329,14 @@ nvptx_output_return (void)
{
machine_mode mode = (machine_mode)cfun->machine->return_mode;
+ const char *fmt;
+ bool kernel = write_as_kernel (DECL_ATTRIBUTES (current_function_decl));
+ if (kernel)
+ fmt = "\tst.param%s\t[%s_out], %s;\n";
+ else
+ fmt = "\tmov%s\t%s_out, %s;\n";
if (mode != VOIDmode)
- fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
+ fprintf (asm_out_file, fmt,
nvptx_ptx_type_from_mode (mode, false),
reg_names[NVPTX_RETURN_REGNUM],
reg_names[NVPTX_RETURN_REGNUM]);
@@ -2063,7 +2108,9 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
fprintf (asm_out_file, "\t{\n");
if (result != NULL)
- fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
+ //We can never have a kernel call another kernel.
+ //fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
+ fprintf (asm_out_file, "\t\t.reg%s %s_in;\n",
nvptx_ptx_type_from_mode (GET_MODE (result), false),
reg_names[NVPTX_RETURN_REGNUM]);
@@ -2088,6 +2135,7 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
fputs (s.str().c_str(), asm_out_file);
}
+#if 0
for (int argno = 1; argno < arg_end; argno++)
{
rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
@@ -2095,12 +2143,14 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
/* Mode splitting has already been done. */
- fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
- "\t\tst.param%s [%%out_arg%d], ",
+ //We can never have a kernel call another kernel, so don't need to care for ".param" here.
+ fprintf (asm_out_file, "\t\t.reg%s %%out_arg%d;\n"
+ "\t\tmov%s %%out_arg%d, ",
ptx_type, argno, ptx_type, argno);
output_reg (asm_out_file, REGNO (t), VOIDmode);
fprintf (asm_out_file, ";\n");
}
+#endif
/* The '.' stands for the call's predicate, if any. */
nvptx_print_operand (asm_out_file, NULL_RTX, '.');
@@ -2120,7 +2170,13 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
const char *open = "(";
for (int argno = 1; argno < arg_end; argno++)
{
+#if 0
fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
+#else
+ fprintf (asm_out_file, ", %s", open);
+ rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
+ output_reg (asm_out_file, REGNO (t), VOIDmode);
+#endif
open = "";
}
if (decl && DECL_STATIC_CHAIN (decl))
@@ -2147,11 +2203,12 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
if (result)
{
- static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
+ //We can never have a kernel call another kernel.
+ static char rval[sizeof ("\tmov%%t0\t%%0, %%%s_in;\n\t}") + 8];
if (!rval[0])
/* We must escape the '%' that starts RETURN_REGNUM. */
- sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
+ sprintf (rval, "\tmov%%t0\t%%0, %%%s_in;\n\t}",
reg_names[NVPTX_RETURN_REGNUM]);
return rval;
}
Grüße
Thomas
^ permalink raw reply [flat|nested] 4+ messages in thread