2015-08-26 Nathan Sidwell * config/nvptx/nvptx.opt (moptimize): New flag. * config/nvptx/nvptx.c (nvptx_option_override): Default nvptx_optimize. (nvptx_optimize_inner): New. (nvptx_process_pars): Call it. * doc/invoke.texi (Nvptx options): Document moptimize. Index: gcc/config/nvptx/nvptx.c =================================================================== --- gcc/config/nvptx/nvptx.c (revision 227180) +++ gcc/config/nvptx/nvptx.c (working copy) @@ -178,6 +178,9 @@ nvptx_option_override (void) write_symbols = NO_DEBUG; debug_info_level = DINFO_LEVEL_NONE; + if (nvptx_optimize < 0) + nvptx_optimize = optimize > 0; + declared_fndecls_htab = hash_table::create_ggc (17); needed_fndecls_htab = hash_table::create_ggc (17); declared_libfuncs_htab @@ -3005,6 +3008,64 @@ nvptx_skip_par (unsigned mask, parallel nvptx_single (mask, par->forked_block, pre_tail); } +/* If PAR has a single inner parallel and PAR itself only contains + empty entry and exit blocks, swallow the inner PAR. */ + +static void +nvptx_optimize_inner (parallel *par) +{ + parallel *inner = par->inner; + + /* We mustn't be the outer dummy par. */ + if (!par->mask) + return; + + /* We must have a single inner par. */ + if (!inner || inner->next) + return; + + /* We must only contain 2 blocks ourselves -- the head and tail of + the inner par. */ + if (par->blocks.length () != 2) + return; + + /* We must be disjoint partitioning. As we only have vector and + worker partitioning, this is sufficient to guarantee the pars + have adjacent partitioning. */ + if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)) + /* This indicates malformed code generation. */ + return; + + /* The outer forked insn should be the only one in its block. 
*/ + rtx_insn *probe; + rtx_insn *forked = par->forked_insn; + for (probe = BB_END (par->forked_block); + probe != forked; probe = PREV_INSN (probe)) + if (INSN_P (probe)) + return; + + /* The outer joining insn, if any, must be in the same block as the inner + joined instruction, which must otherwise be empty of insns. */ + rtx_insn *joining = par->joining_insn; + rtx_insn *join = inner->join_insn; + for (probe = BB_END (inner->join_block); + probe != join; probe = PREV_INSN (probe)) + if (probe != joining && INSN_P (probe)) + return; + + /* Preconditions met. Swallow the inner par. */ + par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1); + + par->blocks.reserve (inner->blocks.length ()); + while (inner->blocks.length ()) + par->blocks.quick_push (inner->blocks.pop ()); + + par->inner = inner->inner; + inner->inner = NULL; + + delete inner; +} + /* Process the parallel PAR and all its contained parallels. We do everything but the neutering. Return mask of partitioned modes used within this parallel. */ @@ -3012,8 +3073,11 @@ nvptx_skip_par (unsigned mask, parallel static unsigned nvptx_process_pars (parallel *par) { - unsigned inner_mask = par->mask; + if (nvptx_optimize) + nvptx_optimize_inner (par); + unsigned inner_mask = par->mask; + /* Do the inner parallels first. */ if (par->inner) { Index: gcc/config/nvptx/nvptx.opt =================================================================== --- gcc/config/nvptx/nvptx.opt (revision 227180) +++ gcc/config/nvptx/nvptx.opt (working copy) @@ -29,6 +29,10 @@ mmainkernel Target Report RejectNegative Link in code for a __main kernel. 
+moptimize +Target Report Var(nvptx_optimize) Init(-1) +Optimize partition neutering + Enum Name(ptx_isa) Type(int) Known PTX ISA versions (for use with the -misa= option): Index: gcc/doc/invoke.texi =================================================================== --- gcc/doc/invoke.texi (revision 227180) +++ gcc/doc/invoke.texi (working copy) @@ -18814,6 +18814,11 @@ Generate code for 32-bit or 64-bit ABI. Link in code for a __main kernel. This is for stand-alone instead of offloading execution. +@item -moptimize +@opindex moptimize +Apply partitioned execution optimizations. This is the default when any +level of optimization is selected. + @end table @node PDP-11 Options