Add extra initialization of broadcasted condition variables 2017-07-11 Tom de Vries * config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG): New macro. (bb_first_real_insn): New function. (nvptx_single): Add extra initialization of broadcasted condition variables. --- gcc/config/nvptx/nvptx.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index daeec27..c8847a5 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -74,6 +74,8 @@ /* This file should be included last. */ #include "target-def.h" +#define WORKAROUND_PTXJIT_BUG 1 + /* The various PTX memory areas an object might reside in. */ enum nvptx_data_area { @@ -3844,6 +3846,24 @@ nvptx_wsync (bool after) return gen_nvptx_barsync (GEN_INT (after)); } +#if WORKAROUND_PTXJIT_BUG +/* Return first real insn in BB, or return NULL_RTX if BB does not contain + real insns. */ + +static rtx_insn * +bb_first_real_insn (basic_block bb) +{ + rtx_insn *insn; + + /* Find first real insn of BB. */ + FOR_BB_INSNS (bb, insn) + if (INSN_P (insn)) + return insn; + + return 0; +} +#endif + /* Single neutering according to MASK. FROM is the incoming block and TO is the outgoing block. These may be the same block. Insert at start of FROM: @@ -3958,6 +3978,39 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) { /* Vector mode only, do a shuffle. */ +#if WORKAROUND_PTXJIT_BUG + /* The branch condition %rcond is propagated like this: + + { + .reg .u32 %x; + mov.u32 %x,%tid.x; + setp.ne.u32 %rnotvzero,%x,0; + } + + @%rnotvzero bra Lskip; + setp.<op>.<type>
%rcond,op1,op2; + Lskip: + selp.u32 %rcondu32,1,0,%rcond; + shfl.idx.b32 %rcondu32,%rcondu32,0,31; + setp.ne.u32 %rcond,%rcondu32,0; + + There seems to be a bug in the ptx JIT compiler (observed at driver + version 381.22, at -O1 and higher for sm_61) that drops the shfl + unless %rcond is initialized to something before 'bra Lskip'. The + bug is not observed with ptxas from cuda 8.0.61. + + It is true that the code is non-trivial: at Lskip, %rcond is + uninitialized in threads 1-31, and after the selp the same holds + for %rcondu32. But shfl propagates the defined value in thread 0 + to threads 1-31, so after the shfl %rcondu32 is defined in threads + 0-31, and after the setp.ne %rcond is defined in threads 0-31. + + There is nothing in the PTX spec to suggest that this is wrong, or + to explain why the extra initialization is needed. So, we classify + it as a JIT bug, and the extra initialization as a workaround. */ + emit_insn_before (gen_movbi (pvar, const0_rtx), + bb_first_real_insn (from)); +#endif emit_insn_before (nvptx_gen_vcast (pvar), tail); } else