[gcc(refs/vendors/vrull/heads/slp-improvements)] aarch64: expand VEC_PERM into ins + uzp[12]

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

From: Philipp Tomsich <ptomsich@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/vendors/vrull/heads/slp-improvements)] aarch64: expand VEC_PERM into ins + uzp[12]
Date: Tue, 23 Jan 2024 20:57:21 +0000 (GMT)	[thread overview]
Message-ID: <20240123205721.3EB5A385800B@sourceware.org> (raw)

https://gcc.gnu.org/g:d61be742513b5b8529ab9ef4022011c471925622

commit d61be742513b5b8529ab9ef4022011c471925622
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date:   Fri Nov 3 14:36:34 2023 +0100

    aarch64: expand VEC_PERM into ins + uzp[12]
    
    The AArch64 backend has specific strategies that can be used to expand
    VEC_PERM expressions (see aarch64_expand_vec_perm_const_1).
    
    The last strategy applied if everything else fails is to use a tbl
    instruction, which is known to have very bad latency and performance
    (see aarch64_evpc_tbl). There are various improvements and additions
    that can be made to reduce the use of harmful tbl instructions.
    
    The existing mechanisms work for cases where the permute can be done
    with a single existing AArch64 vector instruction, but for x264's
    first loop we need some patterns that may need two vector
    instructions.
    
    On x264, this change results in the following change in instruction
    distribution:
            tbl: 8 -> 0
            ldr: 10 -> 8 (due to the eliminated tbls)
            ins: 8 -> 16
            uzp: 8 -> 16
    A reduction of the newly introduced ins/uzp[12] sequences will be
    addressed in a follow-on change.
    
    Ref #344

Diff:
---
 gcc/config/aarch64/aarch64.cc | 76 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index e6bd3fd0bb4..0f2423ef7de 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25890,6 +25890,80 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize a four-element {A, B, A, B} permute (A from op0, B from op1) and
+   expand it as an INS followed by UZP1/UZP2.  This covers a limited set of
+   permutes until a generic search for two-instruction sequences exists.  */
+static bool
+aarch64_evpc_ins_uzp (struct expand_vec_perm_d *d)
+{
+  machine_mode mode = d->vmode;
+
+  if (d->vec_flags != VEC_ADVSIMD || BYTES_BIG_ENDIAN)
+    return false;
+
+  unsigned HOST_WIDE_INT nelt = d->perm.length ().to_constant ();
+
+  if (nelt != 4
+      || !d->perm[0].is_constant()
+      || !d->perm[1].is_constant()
+      || !d->perm.series_p (0, 2, d->perm[0], 0)
+      || !d->perm.series_p (1, 2, d->perm[1], 0))
+    return false;
+
+  /* The selector is {A, B, A, B}; fetch the two constant indices.  */
+  HOST_WIDE_INT A = d->perm[0].to_constant ();
+  HOST_WIDE_INT B = d->perm[1].to_constant ();
+
+  if (A >= nelt || B < nelt || d->op0 == d->op1)
+    return false;
+
+  rtx insv;
+  rtx extractv;
+  HOST_WIDE_INT idx, extractindex;
+
+  /* If A is lane 0/1 of op0, or B is lane 2/3 of op1, that element is already
+     in place: INS the other one into lane IDX, then a UZP1/2 completes it.  */
+  if (A == 0 || A == 1)
+    {
+      insv = d->op0;
+      extractv = d->op1;
+      idx = A == 0 ? 2 : 3;
+      extractindex = B;
+    }
+  else if (B == nelt + 2 || B == nelt + 3)
+    {
+      insv = d->op1;
+      extractv = d->op0;
+      idx = B == nelt + 2 ? 0 : 1;
+      extractindex = A;
+    }
+  else
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  if (extractindex >= nelt)
+    extractindex -= nelt;
+  gcc_assert (extractindex < nelt);
+
+  /* Emit INS into d->target; the lane IDX is passed as the mask 1 << IDX.  */
+  insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
+  expand_operand ops[5];
+  create_output_operand (&ops[0], d->target, mode);
+  create_input_operand (&ops[1], insv, mode);
+  create_integer_operand (&ops[2], 1 << idx);
+  create_input_operand (&ops[3], extractv, mode);
+  create_integer_operand (&ops[4], extractindex);
+  expand_insn (icode, 5, ops);
+
+  /* Emit UZP of the INS result with itself: UZP2 if IDX is odd, else UZP1.  */
+  emit_set_insn (d->target, gen_rtx_UNSPEC (mode, gen_rtvec (2, d->target, d->target),
+				      idx & 1 ? UNSPEC_UZP2 : UNSPEC_UZP1));
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -25931,6 +26005,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	    return true;
 	  else if (aarch64_evpc_ins (d))
 	    return true;
+	  else if (aarch64_evpc_ins_uzp (d))
+	    return true;
 	  else if (aarch64_evpc_reencode (d))
 	    return true;

next             reply	other threads:[~2024-01-23 20:57 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-01-23 20:57 Philipp Tomsich [this message]
2024-01-23 23:50 ` Andrew Pinski
  -- strict thread matches above, loose matches on Subject: below --
2024-02-27 13:37 Philipp Tomsich
2024-01-17 19:14 Philipp Tomsich
2023-11-28 13:35 Philipp Tomsich

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240123205721.3EB5A385800B@sourceware.org \
    --to=ptomsich@gcc.gnu.org \
    --cc=gcc-cvs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).