From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1923) id 5A7813834690; Tue, 28 Nov 2023 13:35:10 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 5A7813834690 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1701178510; bh=RLf29P2P4dhMoR1JJQwgrl79ys6s3IOBbOixNGSYEkw=; h=From:To:Subject:Date:From; b=Sju/Sk9toF6HamJVb4wigE0dL7Nia6lbuMuYLEmYINQPwXEATQ5e6SW7txHMfPDOL vlkpvzYC/gcPWsCmLsueDHCizetR8zSScjiYEcHE072ikspWeq1mKs4v1JtQljgt1v hLCCg9so1KQ+qEwc2r4wxDjGU/MSWUpNeodvilU8= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Philipp Tomsich To: gcc-cvs@gcc.gnu.org Subject: [gcc(refs/vendors/vrull/heads/slp-improvements)] aarch64: expand VEC_PERM into ins + uzp[12] X-Act-Checkin: gcc X-Git-Author: Manolis Tsamis X-Git-Refname: refs/vendors/vrull/heads/slp-improvements X-Git-Oldrev: e9bbe3d12c929449053eadb485bcfa5586d311e0 X-Git-Newrev: 527c082ecad2383d022857e5b50d3fba8705cbe6 Message-Id: <20231128133510.5A7813834690@sourceware.org> Date: Tue, 28 Nov 2023 13:35:10 +0000 (GMT) List-Id: https://gcc.gnu.org/g:527c082ecad2383d022857e5b50d3fba8705cbe6 commit 527c082ecad2383d022857e5b50d3fba8705cbe6 Author: Manolis Tsamis Date: Fri Nov 3 14:36:34 2023 +0100 aarch64: expand VEC_PERM into ins + uzp[12] The AArch64 backend has specific strategies that can be used to expand a VEC_PERM expression (see aarch64_expand_vec_perm_const_1). The last strategy applied if everything else fails is to use a tbl instruction, which is known to have very bad latency and performance (see aarch64_evpc_tbl). There are various improvements and additions that can be done to reduce the harmful tbl instructions. The existing mechanisms work for cases where the permute can be done with a single existing AArch64 vector instruction, but for x264's first loop we need some patterns that may need two vector instructions. 
On x264, this change results in the following change in instruction distribution: tbl: 8 -> 0 ldr: 10 -> 8 (due to the eliminated tbls) ins: 8 -> 16 uzp: 8 -> 16 A reduction of the newly introduced ins/uzp[12] sequences will be addressed in a follow-on change. Ref #344 Diff: --- gcc/config/aarch64/aarch64.cc | 76 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f6f6f94bf43..dc89c8fad30 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -22548,6 +22548,80 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d) return true; } +/* Recognize four-element permutes of the form {A, B, A, B} that can be + emitted as an INS followed by a UZP1 or UZP2. Return false, emitting + nothing, when D does not match; return true once the sequence has been + emitted, or as soon as the match is established when D->testing_p is set. + This addresses limited permute optimizations before a more generic search + algorithm for two operator sequences is implemented. */ +static bool +aarch64_evpc_ins_uzp (struct expand_vec_perm_d *d) +{ + machine_mode mode = d->vmode; + + if (d->vec_flags != VEC_ADVSIMD || BYTES_BIG_ENDIAN) + return false; + + unsigned HOST_WIDE_INT nelt = d->perm.length ().to_constant (); + + if (nelt != 4 + || !d->perm[0].is_constant() + || !d->perm[1].is_constant() + || !d->perm.series_p (0, 2, d->perm[0], 0) + || !d->perm.series_p (1, 2, d->perm[1], 0)) + return false; + + /* We have a {A, B, A, B} permutation. Only continue when A is below + nelt, B is at least nelt, and op0 and op1 are distinct. */ + HOST_WIDE_INT A = d->perm[0].to_constant (); + HOST_WIDE_INT B = d->perm[1].to_constant (); + + if (A >= nelt || B < nelt || d->op0 == d->op1) + return false; + + rtx insv; + rtx extractv; + HOST_WIDE_INT idx, extractindex; + + /* If A is the first element or B is the second element of a UZP1/2 then we + can emit this permute as INS + UZP. When A is 0 or 1, op0 is the vector + inserted into and the element selected by B goes to lane A + 2. When B + is nelt + 2 or nelt + 3, op1 is the vector inserted into and the element + selected by A goes to lane B - nelt - 2. Either way the inserted lane + idx has the same parity as the lane that already holds the other wanted + element. */ + if (A == 0 || A == 1) + { + insv = d->op0; + extractv = d->op1; + idx = A == 0 ? 2 : 3; + extractindex = B; + } + else if (B == nelt + 2 || B == nelt + 3) + { + insv = d->op1; + extractv = d->op0; + idx = B == nelt + 2 ?
0 : 1; + extractindex = A; + } + else + return false; + + if (d->testing_p) + return true; + + if (extractindex >= nelt) + extractindex -= nelt; + gcc_assert (extractindex < nelt); + + /* Emit INS through the aarch64_simd_vec_copy_lane pattern. The operands + are, in order: the output d->target, the vector insv, the lane idx + encoded as the mask 1 << idx, the vector extractv, and extractindex, + which was reduced above so that it is below nelt. */ + insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode); + expand_operand ops[5]; + create_output_operand (&ops[0], d->target, mode); + create_input_operand (&ops[1], insv, mode); + create_integer_operand (&ops[2], 1 << idx); + create_input_operand (&ops[3], extractv, mode); + create_integer_operand (&ops[4], extractindex); + expand_insn (icode, 5, ops); + + /* Emit UZP with d->target as both inputs and as the destination: UZP2 + when idx is odd, UZP1 when idx is even. */ + emit_set_insn (d->target, gen_rtx_UNSPEC (mode, gen_rtvec (2, d->target, d->target), + idx & 1 ? UNSPEC_UZP2 : UNSPEC_UZP1)); + + return true; +} + static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) { @@ -22589,6 +22663,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) return true; else if (aarch64_evpc_ins (d)) return true; + else if (aarch64_evpc_ins_uzp (d)) + return true; else if (aarch64_evpc_reencode (d)) return true;