amdgcn: multi-size vector reductions Add support for vector reductions for any vector width by switching iterators and generalising the code slightly. There's no one-instruction way to move an item from lane 31 to lane 0 (63, 15, 7, 3, and 1 are all fine though), and vec_extract is probably fewer cycles anyway, so now we always reduce to an SGPR. gcc/ChangeLog: * config/gcn/gcn-valu.md (V64_SI): Delete iterator. (V64_DI): Likewise. (V64_1REG): Likewise. (V64_INT_1REG): Likewise. (V64_2REG): Likewise. (V64_ALL): Likewise. (V64_FP): Likewise. (reduc__scal_): Use V_ALL. Use gen_vec_extract. (fold_left_plus_): Use V_FP. (*_dpp_shr_): Use V_1REG. (*_dpp_shr_): Use V_DI. (*plus_carry_dpp_shr_): Use V_INT_1REG. (*plus_carry_in_dpp_shr_): Use V_SI. (*plus_carry_dpp_shr_): Use V_DI. (mov_from_lane63_): Delete. (mov_from_lane63_): Delete. * config/gcn/gcn.cc (gcn_expand_reduc_scalar): Support partial vectors. * config/gcn/gcn.md (unspec): Remove UNSPEC_MOV_FROM_LANE63. diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 00c0e3be1ea..6274d2e9228 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -32,11 +32,6 @@ (define_mode_iterator V_DI (define_mode_iterator V_DF [V2DF V4DF V8DF V16DF V32DF V64DF]) -(define_mode_iterator V64_SI - [V64SI]) -(define_mode_iterator V64_DI - [V64DI]) - ; Vector modes for sub-dword modes (define_mode_iterator V_QIHI [V2QI V2HI @@ -77,13 +72,6 @@ (define_mode_iterator V_FP_1REG V32HF V32SF V64HF V64SF]) -; V64_* modes are for where more general support is unimplemented -; (e.g. reductions) -(define_mode_iterator V64_1REG - [V64QI V64HI V64SI V64HF V64SF]) -(define_mode_iterator V64_INT_1REG - [V64QI V64HI V64SI]) - ; Vector modes for two vector registers (define_mode_iterator V_2REG [V2DI V2DF @@ -93,9 +81,6 @@ (define_mode_iterator V_2REG V32DI V32DF V64DI V64DF]) -(define_mode_iterator V64_2REG - [V64DI V64DF]) - ; Vector modes with native support (define_mode_iterator V_noQI [V2HI V2HF V2SI V2SF V2DI V2DF @@ -158,11 +143,6 @@ (define_mode_iterator V_FP V32HF V32SF V32DF V64HF V64SF V64DF]) -(define_mode_iterator V64_ALL - [V64QI V64HI V64HF V64SI V64SF V64DI V64DF]) -(define_mode_iterator V64_FP - [V64HF V64SF V64DF]) - (define_mode_attr scalar_mode [(V2QI "qi") (V2HI "hi") (V2SI "si") (V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df") @@ -3528,15 +3508,16 @@ (define_int_attr reduc_insn [(UNSPEC_SMIN_DPP_SHR "v_min%i0") (define_expand "reduc__scal_" [(set (match_operand: 0 "register_operand") (unspec: - [(match_operand:V64_ALL 1 "register_operand")] + [(match_operand:V_ALL 1 "register_operand")] REDUC_UNSPEC))] "" { rtx tmp = gcn_expand_reduc_scalar (mode, operands[1], ); - /* The result of the reduction is in lane 63 of tmp. */ - emit_insn (gen_mov_from_lane63_ (operands[0], tmp)); + rtx last_lane = GEN_INT (GET_MODE_NUNITS (mode) - 1); + emit_insn (gen_vec_extract (operands[0], tmp, + last_lane)); DONE; }) @@ -3547,7 +3528,7 @@ (define_expand "reduc__scal_" (define_expand "fold_left_plus_" [(match_operand: 0 "register_operand") (match_operand: 1 "gcn_alu_operand") - (match_operand:V64_FP 2 "gcn_alu_operand")] + (match_operand:V_FP 2 "gcn_alu_operand")] "can_create_pseudo_p () && (flag_openacc || flag_openmp || flag_associative_math)" @@ -3563,11 +3544,11 @@ (define_expand "fold_left_plus_" }) (define_insn "*_dpp_shr_" - [(set (match_operand:V64_1REG 0 "register_operand" "=v") - (unspec:V64_1REG - [(match_operand:V64_1REG 1 "register_operand" "v") - (match_operand:V64_1REG 2 "register_operand" "v") - (match_operand:SI 3 "const_int_operand" "n")] + [(set (match_operand:V_1REG 0 "register_operand" "=v") + (unspec:V_1REG + [(match_operand:V_1REG 1 "register_operand" "v") + (match_operand:V_1REG 2 "register_operand" "v") + (match_operand:SI 3 "const_int_operand" "n")] REDUC_UNSPEC))] ; GCN3 requires a carry out, GCN5 not "!(TARGET_GCN3 && SCALAR_INT_MODE_P (mode) @@ -3580,11 +3561,11 @@ (define_insn "*_dpp_shr_" (set_attr "length" "8")]) (define_insn_and_split "*_dpp_shr_" - [(set (match_operand:V64_DI 0 "register_operand" "=v") - (unspec:V64_DI - [(match_operand:V64_DI 1 "register_operand" "v") - (match_operand:V64_DI 2 "register_operand" "v") - (match_operand:SI 3 "const_int_operand" "n")] + [(set (match_operand:V_DI 0 "register_operand" "=v") + (unspec:V_DI + [(match_operand:V_DI 1 "register_operand" "v") + (match_operand:V_DI 2 "register_operand" "v") + (match_operand:SI 3 "const_int_operand" "n")] REDUC_2REG_UNSPEC))] "" "#" @@ -3609,10 +3590,10 @@ (define_insn_and_split "*_dpp_shr_" ; Special cases for addition. (define_insn "*plus_carry_dpp_shr_" - [(set (match_operand:V64_INT_1REG 0 "register_operand" "=v") - (unspec:V64_INT_1REG - [(match_operand:V64_INT_1REG 1 "register_operand" "v") - (match_operand:V64_INT_1REG 2 "register_operand" "v") + [(set (match_operand:V_INT_1REG 0 "register_operand" "=v") + (unspec:V_INT_1REG + [(match_operand:V_INT_1REG 1 "register_operand" "v") + (match_operand:V_INT_1REG 2 "register_operand" "v") (match_operand:SI 3 "const_int_operand" "n")] UNSPEC_PLUS_CARRY_DPP_SHR)) (clobber (reg:DI VCC_REG))] @@ -3626,12 +3607,12 @@ (define_insn "*plus_carry_dpp_shr_" (set_attr "length" "8")]) (define_insn "*plus_carry_in_dpp_shr_" - [(set (match_operand:V64_SI 0 "register_operand" "=v") - (unspec:V64_SI - [(match_operand:V64_SI 1 "register_operand" "v") - (match_operand:V64_SI 2 "register_operand" "v") - (match_operand:SI 3 "const_int_operand" "n") - (match_operand:DI 4 "register_operand" "cV")] + [(set (match_operand:V_SI 0 "register_operand" "=v") + (unspec:V_SI + [(match_operand:V_SI 1 "register_operand" "v") + (match_operand:V_SI 2 "register_operand" "v") + (match_operand:SI 3 "const_int_operand" "n") + (match_operand:DI 4 "register_operand" "cV")] UNSPEC_PLUS_CARRY_IN_DPP_SHR)) (clobber (reg:DI VCC_REG))] "" @@ -3644,11 +3625,11 @@ (define_insn "*plus_carry_in_dpp_shr_" (set_attr "length" "8")]) (define_insn_and_split "*plus_carry_dpp_shr_" - [(set (match_operand:V64_DI 0 "register_operand" "=v") - (unspec:V64_DI - [(match_operand:V64_DI 1 "register_operand" "v") - (match_operand:V64_DI 2 "register_operand" "v") - (match_operand:SI 3 "const_int_operand" "n")] + [(set (match_operand:V_DI 0 "register_operand" "=v") + (unspec:V_DI + [(match_operand:V_DI 1 "register_operand" "v") + (match_operand:V_DI 2 "register_operand" "v") + (match_operand:SI 3 "const_int_operand" "n")] UNSPEC_PLUS_CARRY_DPP_SHR)) (clobber (reg:DI VCC_REG))] "" @@ -3675,38 +3656,6 @@ (define_insn_and_split "*plus_carry_dpp_shr_" [(set_attr "type" "vmult") (set_attr "length" "16")]) -; Instructions to move a scalar value from lane 63 of a vector register. -(define_insn "mov_from_lane63_" - [(set (match_operand: 0 "register_operand" "=Sg,v") - (unspec: - [(match_operand:V64_1REG 1 "register_operand" " v,v")] - UNSPEC_MOV_FROM_LANE63))] - "" - "@ - v_readlane_b32\t%0, %1, 63 - v_mov_b32\t%0, %1 wave_ror:1" - [(set_attr "type" "vop3a,vop_dpp") - (set_attr "exec" "none,*") - (set_attr "length" "8")]) - -(define_insn "mov_from_lane63_" - [(set (match_operand: 0 "register_operand" "=Sg,v") - (unspec: - [(match_operand:V64_2REG 1 "register_operand" " v,v")] - UNSPEC_MOV_FROM_LANE63))] - "" - "@ - v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63 - * if (REGNO (operands[0]) <= REGNO (operands[1])) \ - return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\" \ - \"v_mov_b32\t%H0, %H1 wave_ror:1\"; \ - else \ - return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\" \ - \"v_mov_b32\t%L0, %L1 wave_ror:1\";" - [(set_attr "type" "vop3a,vop_dpp") - (set_attr "exec" "none,*") - (set_attr "length" "8")]) - ;; }}} ;; {{{ Miscellaneous diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index a561976d7f5..b9d9170f167 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -4918,23 +4918,25 @@ gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn, The vector register SRC of mode MODE is reduced using the operation given by UNSPEC, and the scalar result is returned in lane 63 of a vector - register. */ -/* FIXME: Implement reductions for sizes other than V64. - (They're currently disabled in the machine description.) */ + register (or lane 31, 15, 7, 3, 1 for partial vectors). */ rtx gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec) { machine_mode orig_mode = mode; + machine_mode scalar_mode = GET_MODE_INNER (mode); + int vf = GET_MODE_NUNITS (mode); bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR + || unspec == UNSPEC_SMIN_DPP_SHR || unspec == UNSPEC_SMAX_DPP_SHR || unspec == UNSPEC_UMIN_DPP_SHR || unspec == UNSPEC_UMAX_DPP_SHR) - && (mode == V64DImode - || mode == V64DFmode)) + && (scalar_mode == DImode + || scalar_mode == DFmode)) || (unspec == UNSPEC_PLUS_DPP_SHR - && mode == V64DFmode)); + && scalar_mode == DFmode)); rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN + : unspec == UNSPEC_SMIN_DPP_SHR ? SMIN : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX @@ -4944,23 +4946,23 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec) || unspec == UNSPEC_SMAX_DPP_SHR || unspec == UNSPEC_UMIN_DPP_SHR || unspec == UNSPEC_UMAX_DPP_SHR) - && (mode == V64QImode - || mode == V64HImode)); + && (scalar_mode == QImode + || scalar_mode == HImode)); bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR || unspec == UNSPEC_UMAX_DPP_SHR); bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR && GET_MODE_CLASS (mode) == MODE_VECTOR_INT - && (TARGET_GCN3 || mode == V64DImode); + && (TARGET_GCN3 || scalar_mode == DImode); if (use_plus_carry) unspec = UNSPEC_PLUS_CARRY_DPP_SHR; if (use_extends) { - rtx tmp = gen_reg_rtx (V64SImode); + mode = VnMODE (vf, SImode); + rtx tmp = gen_reg_rtx (mode); convert_move (tmp, src, unsignedp); src = tmp; - mode = V64SImode; } /* Perform reduction by first performing the reduction operation on every @@ -4968,7 +4970,8 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec) iteration (thereby effectively reducing every 4 lanes) and so on until all lanes are reduced. */ rtx in, out = force_reg (mode, src); - for (int i = 0, shift = 1; i < 6; i++, shift <<= 1) + int iterations = exact_log2 (vf); + for (int i = 0, shift = 1; i < iterations; i++, shift <<= 1) { rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift); in = out; diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index a3c9523cd6d..6c1a438f9d1 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -78,7 +78,6 @@ (define_c_enum "unspec" [ UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR UNSPEC_MOV_DPP_SHR - UNSPEC_MOV_FROM_LANE63 UNSPEC_GATHER UNSPEC_SCATTER UNSPEC_RCP