From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <ptomsich@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1923)
	id 5B31938582A4; Tue, 27 Feb 2024 13:37:23 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 5B31938582A4
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1709041043;
	bh=KlQXZm/fuufw/9ZqG5V8iau097GuCBiPGvud7T8RYWM=;
	h=From:To:Subject:Date:From;
	b=iYS9+nc/5TDrGqqnzAFMJPMJX6NXx/NB1+L3cTq+36KX5lDwqqQ0zL84BdQv/4knF
	 NDpPQkmNAxEdKwEG6hBK/BHdExhIBPjoZNQ7ScmvOMbmsSVxqr+bqWGLXz31suJoOn
	 C6GZV781F/NXvTrYhvFgRtqACIezsHMe6NR/EPuA=
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Philipp Tomsich <ptomsich@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/vendors/vrull/heads/slp-improvements)] aarch64: expand
 VEC_PERM into ins + uzp[12]
X-Act-Checkin: gcc
X-Git-Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
X-Git-Refname: refs/vendors/vrull/heads/slp-improvements
X-Git-Oldrev: e8a30f2e241d213a9761a473eb63f1c32e2c182e
X-Git-Newrev: ed7d62118f587c660491efa06e53ab003eef4a41
Message-Id: <20240227133723.5B31938582A4@sourceware.org>
Date: Tue, 27 Feb 2024 13:37:23 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:ed7d62118f587c660491efa06e53ab003eef4a41

commit ed7d62118f587c660491efa06e53ab003eef4a41
Author: Manolis Tsamis <manolis.tsamis@vrull.eu>
Date:   Fri Nov 3 14:36:34 2023 +0100

    aarch64: expand VEC_PERM into ins + uzp[12]
    
    The AArch64 backend has specific strategies that can be used to expand
    VEC_PERM expression (see aarch64_expand_vec_perm_const_1).
    
    The last strategy applied if everything else fails is to use a tbl
    instruction, which is known to have very bad latency and performance
    (see aarch64_evpc_tbl). There are various improvements and additions
    that can be made to reduce the use of the harmful tbl instructions.
    
    The existing mechanisms work for cases where the permute can be done
    with a single existing AArch64 vector instruction, but for x264's
    first loop we need some patterns that may need two vector
    instructions.
    
    On x264, this change results in the following change in instruction
    distribution:
            tbl: 8 -> 0
            ldr: 10 -> 8 (due to the eliminated tbls)
            ins: 8 -> 16
            uzp: 8 -> 16
    A reduction of the newly introduced ins/uzp[12] sequences will be
    addressed in a follow-on change.
    
    Ref #344

Diff:
---
 gcc/config/aarch64/aarch64.cc               | 76 +++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/vins_uzp.c | 36 ++++++++++++++
 2 files changed, 112 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 16318bf9258..a1c214a5104 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25960,6 +25960,80 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize {A, B, A, B} permutes of 4-element vectors suitable for INS + UZP.
+   A stopgap until a generic search for two-operator sequences is implemented.
+   Return true on a match, emitting the insns unless D->testing_p is set.  */
+static bool
+aarch64_evpc_ins_uzp (struct expand_vec_perm_d *d)
+{
+  machine_mode mode = d->vmode;
+
+  if (d->vec_flags != VEC_ADVSIMD || BYTES_BIG_ENDIAN)
+    return false;
+
+  unsigned HOST_WIDE_INT nelt = d->perm.length ().to_constant ();
+
+  if (nelt != 4
+      || !d->perm[0].is_constant()
+      || !d->perm[1].is_constant()
+      || !d->perm.series_p (0, 2, d->perm[0], 0)
+      || !d->perm.series_p (1, 2, d->perm[1], 0))
+    return false;
+
+  /* We have a {A, B, A, B} permutation.  */
+  unsigned HOST_WIDE_INT A = d->perm[0].to_constant ();
+  unsigned HOST_WIDE_INT B = d->perm[1].to_constant ();
+
+  if (A >= nelt || B < nelt || d->op0 == d->op1)
+    return false;
+
+  rtx insv;
+  rtx extractv;
+  unsigned HOST_WIDE_INT idx, extractindex;
+
+  /* If A is the first element or B is the second element of a UZP1/2 then we
+     can emit this permute as INS + UZP.  */
+  if (A == 0 || A == 1)
+    {
+      insv = d->op0;
+      extractv = d->op1;
+      idx = A == 0 ? 2 : 3;
+      extractindex = B;
+    }
+  else if (B == nelt + 2 || B == nelt + 3)
+    {
+      insv = d->op1;
+      extractv = d->op0;
+      idx = B == nelt + 2 ? 0 : 1;
+      extractindex = A;
+    }
+  else
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  if (extractindex >= nelt)
+    extractindex -= nelt;
+  gcc_assert (extractindex < nelt);
+
+  /* Emit INS.  */
+  insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
+  expand_operand ops[5];
+  create_output_operand (&ops[0], d->target, mode);
+  create_input_operand (&ops[1], insv, mode);
+  create_integer_operand (&ops[2], 1 << idx);
+  create_input_operand (&ops[3], extractv, mode);
+  create_integer_operand (&ops[4], extractindex);
+  expand_insn (icode, 5, ops);
+
+  /* Emit UZP of the INS result with itself; odd IDX selects UZP2, else UZP1.  */
+  emit_set_insn (d->target, gen_rtx_UNSPEC (mode, gen_rtvec (2, d->target, d->target),
+				      idx & 1 ? UNSPEC_UZP2 : UNSPEC_UZP1));
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -26001,6 +26075,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	    return true;
 	  else if (aarch64_evpc_ins (d))
 	    return true;
+	  else if (aarch64_evpc_ins_uzp (d))
+	    return true;
 	  else if (aarch64_evpc_reencode (d))
 	    return true;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vins_uzp.c b/gcc/testsuite/gcc.target/aarch64/vins_uzp.c
new file mode 100644
index 00000000000..d82d1f43c15
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins_uzp.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+typedef int v4si __attribute__ ((vector_size (4 * sizeof (int))));
+
+v4si case1(v4si a, v4si b) { /* Expected path: A == 0, so idx = 2 and UZP1.  */
+    return  __builtin_shufflevector (a, b, 0, 5, 0, 5);
+}
+
+v4si case2(v4si a, v4si b) { /* Expected path: A == 1, so idx = 3 and UZP2.  */
+    return  __builtin_shufflevector (a, b, 1, 5, 1, 5);
+}
+
+v4si case3(v4si a, v4si b) { /* Expected path: A == 0 is tested first: idx = 2, UZP1.  */
+    return  __builtin_shufflevector (a, b, 0, 6, 0, 6);
+}
+
+v4si case4(v4si a, v4si b) { /* Expected path: A == 1 is tested first: idx = 3, UZP2.  */
+    return  __builtin_shufflevector (a, b, 1, 7, 1, 7);
+}
+
+v4si case5(v4si a, v4si b) { /* Expected path: B == nelt + 3, so idx = 1 and UZP2.  */
+    return  __builtin_shufflevector (a, b, 2, 7, 2, 7);
+}
+
+v4si case6(v4si a, v4si b) { /* Same selector as case5 with the operands swapped.  */
+    return  __builtin_shufflevector (b, a, 2, 7, 2, 7);
+}
+
+v4si case7(v4si a, v4si b) { /* First index >= nelt; presumably inputs get swapped -- TODO confirm.  */
+    return  __builtin_shufflevector (a, b, 7, 2, 7, 2);
+}
+
+/* { dg-final { scan-assembler-not {\ttbl\t} } } */
+/* { dg-final { scan-assembler-not {\tldr\t} } } */