2015-11-10  Nathan Sidwell

	* config/nvptx/nvptx.opt (moptimize): New flag.
	* config/nvptx/nvptx.c (nvptx_option_override): Set nvptx_optimize
	default.
	(nvptx_optimize_inner): New.
	(nvptx_process_pars): Call it when optimizing.
	* doc/invoke.texi (Nvidia PTX Options): Document -moptimize.

Index: config/nvptx/nvptx.c
===================================================================
--- config/nvptx/nvptx.c	(revision 230112)
+++ config/nvptx/nvptx.c	(working copy)
@@ -137,6 +137,9 @@ nvptx_option_override (void)
   write_symbols = NO_DEBUG;
   debug_info_level = DINFO_LEVEL_NONE;
 
+  if (nvptx_optimize < 0)
+    nvptx_optimize = optimize > 0;
+
   declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
   needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
   declared_libfuncs_htab
@@ -2942,6 +2945,69 @@ nvptx_skip_par (unsigned mask, parallel
   nvptx_single (mask, par->forked_block, pre_tail);
 }
 
+/* If PAR has a single inner parallel and PAR itself only contains
+   empty entry and exit blocks, swallow the inner PAR.  */
+
+static void
+nvptx_optimize_inner (parallel *par)
+{
+  parallel *inner = par->inner;
+
+  /* We mustn't be the outer dummy par.  */
+  if (!par->mask)
+    return;
+
+  /* We must have a single inner par.  */
+  if (!inner || inner->next)
+    return;
+
+  /* We must only contain 2 blocks ourselves -- the head and tail of
+     the inner par.  */
+  if (par->blocks.length () != 2)
+    return;
+
+  /* We must be disjoint partitioning.  As we only have vector and
+     worker partitioning, this is sufficient to guarantee the pars
+     have adjacent partitioning.  */
+  if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
+    /* This indicates malformed code generation.  */
+    return;
+
+  /* The outer forked insn should be immediately followed by the inner
+     fork insn.  */
+  rtx_insn *forked = par->forked_insn;
+  rtx_insn *fork = BB_END (par->forked_block);
+
+  if (NEXT_INSN (forked) != fork)
+    return;
+  gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
+
+  /* The outer joining insn must immediately follow the inner join
+     insn.  */
+  rtx_insn *joining = par->joining_insn;
+  rtx_insn *join = inner->join_insn;
+  if (NEXT_INSN (join) != joining)
+    return;
+
+  /* Preconditions met.  Swallow the inner par.  */
+  if (dump_file)
+    fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
+	     inner->mask, inner->forked_block->index,
+	     inner->join_block->index,
+	     par->mask, par->forked_block->index, par->join_block->index);
+
+  par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
+
+  par->blocks.reserve (inner->blocks.length ());
+  while (inner->blocks.length ())
+    par->blocks.quick_push (inner->blocks.pop ());
+
+  par->inner = inner->inner;
+  inner->inner = NULL;
+
+  delete inner;
+}
+
 /* Process the parallel PAR and all its contained parallels.  We do
    everything but the neutering.  Return mask of partitioned modes
    used within this parallel.  */
@@ -2949,6 +3015,9 @@ nvptx_skip_par (unsigned mask, parallel
 static unsigned
 nvptx_process_pars (parallel *par)
 {
+  if (nvptx_optimize)
+    nvptx_optimize_inner (par);
+
   unsigned inner_mask = par->mask;
 
   /* Do the inner parallels first.  */
Index: config/nvptx/nvptx.opt
===================================================================
--- config/nvptx/nvptx.opt	(revision 230112)
+++ config/nvptx/nvptx.opt	(working copy)
@@ -28,3 +28,7 @@ Generate code for a 64-bit ABI.
 mmainkernel
 Target Report RejectNegative
 Link in code for a __main kernel.
+
+moptimize
+Target Report Var(nvptx_optimize) Init(-1)
+Optimize partition neutering
Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 230112)
+++ doc/invoke.texi	(working copy)
@@ -873,7 +873,7 @@ Objective-C and Objective-C++ Dialects}.
 -march=@var{arch} -mbmx -mno-bmx -mcdx -mno-cdx}
 
 @emph{Nvidia PTX Options}
-@gccoptlist{-m32 -m64 -mmainkernel}
+@gccoptlist{-m32 -m64 -mmainkernel -moptimize}
 
 @emph{PDP-11 Options}
 @gccoptlist{-mfpu -msoft-float -mac0 -mno-ac0 -m40 -m45 -m10 @gol
@@ -18960,6 +18960,11 @@ Generate code for 32-bit or 64-bit ABI.
 Link in code for a __main kernel.  This is for stand-alone instead of
 offloading execution.
 
+@item -moptimize
+@opindex moptimize
+Apply partitioned execution optimizations.  This is the default when any
+level of optimization is selected.
+
 @end table
 
 @node PDP-11 Options
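
[For reviewers; not part of the patch.]  A sketch of the kind of loop nest the
new nvptx_optimize_inner transformation targets: an outer worker-partitioned
region whose only content is a single vector-partitioned region, so the outer
fork/join sits immediately adjacent to the inner one and the two can be
neutered as one combined worker+vector region.  The pragmas, names and bounds
below are illustrative assumptions, not taken from the patch or its testcases.

  /* Illustrative only: an outer worker loop containing nothing but an inner
     vector loop.  With -moptimize (the default once any -O level is in
     effect, per the new nvptx_optimize handling above) the two adjacent
     fork/join regions are expected to be merged; with -mno-optimize they
     remain separate.  */
  void
  scale_rows (int n, int m, float a, const float *restrict x,
              float *restrict y)
  {
  #pragma acc parallel loop worker
    for (int i = 0; i < n; i++)
  #pragma acc loop vector
      for (int j = 0; j < m; j++)
        y[i * m + j] += a * x[i * m + j];
  }

Assuming an offloading-enabled toolchain, one way to see the difference is to
pass -mno-optimize through to the offload compiler (e.g. via
-foffload=nvptx-none=-mno-optimize) and compare the generated PTX against the
default.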