From: Andrew Stubbs <ams@codesourcery.com>
Subject: [committed 4/6] amdgcn: vec_init for multiple vector sizes
Date: Tue, 11 Oct 2022 12:02:06 +0100
Message-ID: <769a10d0fc45e4923d7eb631170a117529ad5e39.1665485382.git.ams@codesourcery.com>

Implements vec_init when the input is a vector of smaller vectors, or of
vector MEM types, or a smaller vector duplicated several times.

gcc/ChangeLog:

	* config/gcn/gcn-valu.md (vec_init<V_ALL:mode><V_ALL_ALT:mode>): New.
	* config/gcn/gcn.cc (GEN_VN): Add andvNsi3, subvNsi3.
	(GEN_VNM): Add gathervNm_expr.
	(GEN_VN_NOEXEC): Add vec_seriesvNsi.
	(gcn_expand_vector_init): Add initialization of vectors from smaller
	vectors.
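[To picture the input shape this handles, here is a hypothetical
source-level illustration of my own, not part of the patch; it uses GCC's
generic vector extensions, and whether the middle end actually hands the
back end a vector-of-vectors vec_init for it depends on how the construct
is lowered:

typedef int v16si __attribute__ ((vector_size (16 * sizeof (int))));
typedef int v32si __attribute__ ((vector_size (32 * sizeof (int))));

/* Build a 32-lane vector from two 16-lane pieces; a vec_init whose
   elements are themselves vectors is the case the new expander accepts,
   including the repeated case concat (a, a).  */
v32si
concat (v16si lo, v16si hi)
{
  union { v16si half[2]; v32si whole; } u = { { lo, hi } };
  return u.whole;
}

Host-side sketches of the mask and permutation arithmetic used by
gcn_expand_vector_init follow the patch.]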
---
 gcc/config/gcn/gcn-valu.md |  10 +++
 gcc/config/gcn/gcn.cc      | 159 +++++++++++++++++++++++++++++++------
 2 files changed, 143 insertions(+), 26 deletions(-)

diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 9ea60e1174f..f708e587f38 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -893,6 +893,16 @@ (define_expand "vec_init<mode>"
     DONE;
   })
 
+(define_expand "vec_init<V_ALL:mode><V_ALL_ALT:mode>"
+  [(match_operand:V_ALL 0 "register_operand")
+   (match_operand:V_ALL_ALT 1)]
+  "<V_ALL:SCALAR_MODE>mode == <V_ALL_ALT:SCALAR_MODE>mode
+   && MODE_VF (<V_ALL_ALT:MODE>mode) < MODE_VF (<V_ALL:MODE>mode)"
+  {
+    gcn_expand_vector_init (operands[0], operands[1]);
+    DONE;
+  })
+
 ;; }}}
 ;; {{{ Scatter / Gather

diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index fdcf290ef8b..3dc294c2d2f 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -1365,12 +1365,17 @@ GEN_VN (add,di3_vcc_zext_dup2, A(rtx dest, rtx src1, rtx src2, rtx vcc),
 	A(dest, src1, src2, vcc))
 GEN_VN (addc,si3, A(rtx dest, rtx src1, rtx src2, rtx vccout, rtx vccin),
 	A(dest, src1, src2, vccout, vccin))
+GEN_VN (and,si3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
 GEN_VN (ashl,si3, A(rtx dest, rtx src, rtx shift), A(dest, src, shift))
 GEN_VNM_NOEXEC (ds_bpermute,, A(rtx dest, rtx addr, rtx src, rtx exec),
 		A(dest, addr, src, exec))
+GEN_VNM (gather,_expr, A(rtx dest, rtx addr, rtx as, rtx vol),
+	 A(dest, addr, as, vol))
 GEN_VNM (mov,, A(rtx dest, rtx src), A(dest, src))
 GEN_VN (mul,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
+GEN_VN (sub,si3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
 GEN_VNM (vec_duplicate,, A(rtx dest, rtx src), A(dest, src))
+GEN_VN_NOEXEC (vec_series,si, A(rtx dest, rtx x, rtx c), A(dest, x, c))
 
 #undef GEN_VNM
 #undef GEN_VN
@@ -1993,44 +1998,146 @@ regno_ok_for_index_p (int regno)
 void
 gcn_expand_vector_init (rtx op0, rtx vec)
 {
-  int64_t initialized_mask = 0;
-  int64_t curr_mask = 1;
+  rtx val[64];
   machine_mode mode = GET_MODE (op0);
   int vf = GET_MODE_NUNITS (mode);
+  machine_mode addrmode = VnMODE (vf, DImode);
+  machine_mode offsetmode = VnMODE (vf, SImode);
 
-  rtx val = XVECEXP (vec, 0, 0);
+  int64_t mem_mask = 0;
+  int64_t item_mask[64];
+  rtx ramp = gen_reg_rtx (offsetmode);
+  rtx addr = gen_reg_rtx (addrmode);
 
-  for (int i = 1; i < vf; i++)
-    if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
-      curr_mask |= (int64_t) 1 << i;
+  int unit_size = GET_MODE_SIZE (GET_MODE_INNER (GET_MODE (op0)));
+  emit_insn (gen_mulvNsi3_dup (ramp, gen_rtx_REG (offsetmode, VGPR_REGNO (1)),
+			       GEN_INT (unit_size)));
 
-  if (gcn_constant_p (val))
-    emit_move_insn (op0, gcn_vec_constant (mode, val));
-  else
+  bool simple_repeat = true;
+
+  /* Expand nested vectors into one vector.  */
+  int item_count = XVECLEN (vec, 0);
+  for (int i = 0, j = 0; i < item_count; i++)
+    {
+      rtx item = XVECEXP (vec, 0, i);
+      machine_mode mode = GET_MODE (item);
+      int units = VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 1;
+      item_mask[j] = (((uint64_t)-1)>>(64-units)) << j;
+
+      if (simple_repeat && i != 0)
+	simple_repeat = item == XVECEXP (vec, 0, i-1);
+
+      /* If it's a vector of values then copy them into the final location.  */
+      if (GET_CODE (item) == CONST_VECTOR)
+	{
+	  for (int k = 0; k < units; k++)
+	    val[j++] = XVECEXP (item, 0, k);
+	  continue;
+	}
+      /* Otherwise, we have a scalar or an expression that expands...  */
+      if (MEM_P (item))
+	{
+	  rtx base = XEXP (item, 0);
+	  if (MEM_ADDR_SPACE (item) == DEFAULT_ADDR_SPACE
+	      && REG_P (base))
+	    {
+	      /* We have a simple vector load.  We can put the addresses in
+		 the vector, combine it with any other such MEMs, and load it
+		 all with a single gather at the end.  */
+	      int64_t mask = ((0xffffffffffffffffUL
+			       >> (64-GET_MODE_NUNITS (mode)))
+			      << j);
+	      rtx exec = get_exec (mask);
+	      emit_insn (gen_subvNsi3
+			 (ramp, ramp,
+			  gcn_vec_constant (offsetmode, j*unit_size),
+			  ramp, exec));
+	      emit_insn (gen_addvNdi3_zext_dup2
+			 (addr, ramp, base,
+			  (mem_mask ? addr : gcn_gen_undef (addrmode)),
+			  exec));
+	      mem_mask |= mask;
+	    }
+	  else
+	    /* The MEM is non-trivial, so let's load it independently.  */
+	    item = force_reg (mode, item);
+	}
+      else if (!CONST_INT_P (item) && !CONST_DOUBLE_P (item))
+	/* The item may be a symbol_ref, or something else non-trivial.  */
+	item = force_reg (mode, item);
+
+      /* Duplicate the vector across each item.
+	 It is either a smaller vector register that needs shifting,
+	 or a MEM that needs loading.  */
+      val[j] = item;
+      j += units;
+    }
+
+  int64_t initialized_mask = 0;
+  rtx prev = NULL;
+
+  if (mem_mask)
     {
-      val = force_reg (GET_MODE_INNER (mode), val);
-      emit_insn (gen_vec_duplicatevNm (op0, val));
+      emit_insn (gen_gathervNm_expr
+		 (op0, gen_rtx_PLUS (addrmode, addr,
+				     gen_rtx_VEC_DUPLICATE (addrmode,
+							    const0_rtx)),
+		  GEN_INT (DEFAULT_ADDR_SPACE), GEN_INT (0),
+		  NULL, get_exec (mem_mask)));
+      prev = op0;
+      initialized_mask = mem_mask;
     }
-  initialized_mask |= curr_mask;
-  for (int i = 1; i < vf; i++)
+
+  if (simple_repeat && item_count > 1 && !prev)
+    {
+      /* Special case for instances of {A, B, A, B, A, B, ...}, etc.  */
+      rtx src = gen_rtx_SUBREG (mode, val[0], 0);
+      rtx input_vf_mask = GEN_INT (GET_MODE_NUNITS (GET_MODE (val[0]))-1);
+
+      rtx permutation = gen_reg_rtx (VnMODE (vf, SImode));
+      emit_insn (gen_vec_seriesvNsi (permutation, GEN_INT (0), GEN_INT (1)));
+      rtx mask_dup = gen_reg_rtx (VnMODE (vf, SImode));
+      emit_insn (gen_vec_duplicatevNsi (mask_dup, input_vf_mask));
+      emit_insn (gen_andvNsi3 (permutation, permutation, mask_dup));
+      emit_insn (gen_ashlvNsi3 (permutation, permutation, GEN_INT (2)));
+      emit_insn (gen_ds_bpermutevNm (op0, permutation, src, get_exec (mode)));
+      return;
+    }
+
+  /* Write each value, elementwise, but coalesce matching values into one
+     instruction, where possible.  */
+  for (int i = 0; i < vf; i++)
     if (!(initialized_mask & ((int64_t) 1 << i)))
       {
-	curr_mask = (int64_t) 1 << i;
-	rtx val = XVECEXP (vec, 0, i);
-
-	for (int j = i + 1; j < vf; j++)
-	  if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
-	    curr_mask |= (int64_t) 1 << j;
-	if (gcn_constant_p (val))
-	  emit_insn (gen_movvNm (op0, gcn_vec_constant (mode, val), op0,
-				 get_exec (curr_mask)));
+	if (gcn_constant_p (val[i]))
+	  emit_insn (gen_movvNm (op0, gcn_vec_constant (mode, val[i]), prev,
+				 get_exec (item_mask[i])));
+	else if (VECTOR_MODE_P (GET_MODE (val[i]))
+		 && (GET_MODE_NUNITS (GET_MODE (val[i])) == vf
+		     || i == 0))
+	  emit_insn (gen_movvNm (op0, gen_rtx_SUBREG (mode, val[i], 0), prev,
+				 get_exec (item_mask[i])));
+	else if (VECTOR_MODE_P (GET_MODE (val[i])))
+	  {
+	    rtx permutation = gen_reg_rtx (VnMODE (vf, SImode));
+	    emit_insn (gen_vec_seriesvNsi (permutation, GEN_INT (-i*4),
+					   GEN_INT (4)));
+	    rtx tmp = gen_reg_rtx (mode);
+	    emit_insn (gen_ds_bpermutevNm (tmp, permutation,
+					   gen_rtx_SUBREG (mode, val[i], 0),
+					   get_exec (-1)));
+	    emit_insn (gen_movvNm (op0, tmp, prev, get_exec (item_mask[i])));
+	  }
 	else
 	  {
-	    val = force_reg (GET_MODE_INNER (mode), val);
-	    emit_insn (gen_vec_duplicatevNm (op0, val, op0,
-					     get_exec (curr_mask)));
+	    rtx reg = force_reg (GET_MODE_INNER (mode), val[i]);
+	    emit_insn (gen_vec_duplicatevNm (op0, reg, prev,
+					     get_exec (item_mask[i])));
 	  }
-	initialized_mask |= curr_mask;
+
+	initialized_mask |= item_mask[i];
+	prev = op0;
       }
 }
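[As a cross-check of the mask arithmetic in gcn_expand_vector_init, here
is a standalone host-side sketch of my own, not part of the patch, of the
item_mask[]/mem_mask expression, which selects a run of `units' lanes
starting at lane j:

#include <stdint.h>
#include <stdio.h>

/* Mirrors (((uint64_t)-1)>>(64-units)) << j from the patch: UNITS set
   bits starting at bit J.  Well-defined for 1 <= units <= 64, which
   holds because GCN vectors have at most 64 lanes.  */
static uint64_t
lane_mask (int units, int j)
{
  return (((uint64_t) -1) >> (64 - units)) << j;
}

int
main (void)
{
  /* A 16-lane item placed at lane 32 covers lanes 32..47.  */
  printf ("%016llx\n", (unsigned long long) lane_mask (16, 32));
  /* Prints 0000ffff00000000.  */
  return 0;
}]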
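[The {A, B, A, B, ...} special case builds the ds_bpermute selector as
(lane & (input_vf - 1)) << 2: a 0, 1, 2, ... series ANDed down so it wraps
around the smaller input, then shifted left by 2 because ds_bpermute
addresses 32-bit lanes in bytes.  A host-side sketch of mine, not part of
the patch:

#include <stdio.h>

int
main (void)
{
  int vf = 8, input_vf = 2;	/* a two-lane {A, B} repeated four times */
  for (int lane = 0; lane < vf; lane++)
    {
      /* vec_series (0, 1), AND with input_vf-1, then << 2.  */
      int byte_offset = (lane & (input_vf - 1)) << 2;
      printf ("lane %d reads input lane %d\n", lane, byte_offset >> 2);
    }
  return 0;
}

For vf = 8 and input_vf = 2 this prints the alternating pattern
0, 1, 0, 1, ..., which replicates {A, B} across the whole output vector.]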
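[Finally, the elementwise fallback positions a smaller vector at lane i
via vec_series (-i*4, 4): output lane k gets byte offset 4*(k - i), so
ds_bpermute reads source lane k - i, and the following masked move keeps
only the lanes selected by item_mask[i].  A host-side sketch of mine, not
part of the patch:

#include <stdio.h>

int
main (void)
{
  int vf = 8, i = 4;	/* shift a smaller vector up to start at lane 4 */
  for (int k = 0; k < vf; k++)
    {
      int byte_offset = -i * 4 + k * 4;	/* vec_series (-i*4, 4) */
      if (byte_offset >= 0)
	printf ("lane %d reads source lane %d\n", k, byte_offset / 4);
      else
	printf ("lane %d: don't care, discarded by the exec mask\n", k);
    }
  return 0;
}]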