From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <victorldn@sourceware.org>
Received: by sourceware.org (Postfix, from userid 7922)
	id 3460F3858D1E; Tue, 25 Apr 2023 10:44:57 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 3460F3858D1E
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1682419497;
	bh=W8eZF1FQp9hsaTOpBqbY/ATjq2o3McTfrAy35OkWZqM=;
	h=From:To:Subject:Date:From;
	b=RvcR/PbHntSVqnk9AVxitbVJinCrCue9AYI4oYdqAEqUBlQ7ayAUK9YHp/jz9hgF3
	 a+ogVNMnfY4PJwJzdBkK5HmI2wei3C4VoyDYM/fz+C9opG9K9fyiMA6nENtqobY43T
	 X+YIlVqr80KPSM6Stl78nTHMuqsVGOL88QDuyiks=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Victor Do Nascimento <victorldn@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-215] aarch64: Leveraging the use of STP instruction for
 vec_duplicate
X-Act-Checkin: gcc
X-Git-Author: Victor Do Nascimento <victor.donascimento@arm.com>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: a024ac7bca9b9de1d2e0c19d4bb11df293e27a7d
X-Git-Newrev: 85279b0bddc1c5a7d181e2168e26ded354b21f32
Message-Id: <20230425104457.3460F3858D1E@sourceware.org>
Date: Tue, 25 Apr 2023 10:44:57 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:85279b0bddc1c5a7d181e2168e26ded354b21f32

commit r14-215-g85279b0bddc1c5a7d181e2168e26ded354b21f32
Author: Victor Do Nascimento <victor.donascimento@arm.com>
Date:   Tue Apr 25 10:57:00 2023 +0100

    aarch64: Leveraging the use of STP instruction for vec_duplicate
    
    The backend lacked a pattern for storing a pair of identical 32-bit
    or 64-bit values with a single STP instruction.  As a result, the
    RTL pattern match failed in the combine pass and multiple
    instructions were needed to perform the store.
    
    For the test case:
    
    typedef long long v2di __attribute__((vector_size (16)));
    typedef int v2si __attribute__((vector_size (8)));
    
    void
    foo (v2di *x, long long a)
    {
      v2di tmp = {a, a};
      *x = tmp;
    }
    
    void
    foo2 (v2si *x, int a)
    {
      v2si tmp = {a, a};
      *x = tmp;
    }
    
    at -O2 on aarch64 gives:
    
    foo:
        stp x1, x1, [x0]
        ret
    foo2:
        stp w1, w1, [x0]
        ret
    
    instead of:
    
    foo:
            dup     v0.2d, x1
            str     q0, [x0]
            ret
    foo2:
            dup     v0.2s, w1
            str     d0, [x0]
            ret
    
    Bootstrapped and regtested on aarch64-none-linux-gnu.
    
    gcc/
            * config/aarch64/aarch64-simd.md (aarch64_simd_stp<mode>): New.
            * config/aarch64/constraints.md: Make "Umn" relaxed memory
            constraint.
            * config/aarch64/iterators.md (ldpstp_vel_sz): New.
    
    gcc/testsuite/
            * gcc.target/aarch64/stp_vec_dup_32_64-1.c: New.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md                 | 10 ++++
 gcc/config/aarch64/constraints.md                  |  2 +-
 gcc/config/aarch64/iterators.md                    |  3 ++
 .../gcc.target/aarch64/stp_vec_dup_32_64-1.c       | 57 ++++++++++++++++++++++
 4 files changed, 71 insertions(+), 1 deletion(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9f2fce6f033..cfad812658f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -257,6 +257,16 @@
   [(set_attr "type" "neon_stp")]
 )
 
+(define_insn "aarch64_simd_stp<mode>"
+  [(set (match_operand:VP_2E 0 "aarch64_mem_pair_lanes_operand" "=Umn,Umn")
+	(vec_duplicate:VP_2E (match_operand:<VEL> 1 "register_operand" "w,r")))]
+  "TARGET_SIMD"
+  "@
+   stp\\t%<Vetype>1, %<Vetype>1, %y0
+   stp\\t%<vw>1, %<vw>1, %y0"
+  [(set_attr "type" "neon_stp, store_<ldpstp_vel_sz>")]
+)
+
 (define_insn "load_pair<VQ:mode><VQ2:mode>"
   [(set (match_operand:VQ 0 "register_operand" "=w")
 	(match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 5b20abc27e5..6df1dbec2a8 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -287,7 +287,7 @@
 ;; Used for storing or loading pairs in an AdvSIMD register using an STP/LDP
 ;; as a vector-concat.  The address mode uses the same constraints as if it
 ;; were for a single value.
-(define_memory_constraint "Umn"
+(define_relaxed_memory_constraint "Umn"
   "@internal
   A memory address suitable for a load/store pair operation."
   (and (match_code "mem")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 13a7e89777d..1d0b4822102 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1020,6 +1020,9 @@
 ;; Likewise for load/store pair.
 (define_mode_attr ldpstp_sz [(SI "8") (DI "16")])
 
+;; Size of element access for STP/LDP-generated vectors.
+(define_mode_attr ldpstp_vel_sz [(V2SI "8") (V2SF "8") (V2DI "16") (V2DF "16")])
+
 ;; For inequal width int to float conversion
 (define_mode_attr w1 [(HF "w") (SF "w") (DF "x")])
 (define_mode_attr w2 [(HF "x") (SF "x") (DF "w")])
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c
new file mode 100644
index 00000000000..fc2c1ea39e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_dup_32_64-1.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+typedef int v2si __attribute__((vector_size (8)));
+
+#define TESTV2DI(lab, idx)			\
+  void						\
+  stpv2di_##lab (v2di *x, long long a)		\
+  {						\
+    v2di tmp = {a, a};				\
+    x[idx] = tmp;				\
+  }
+
+
+#define TESTV2SI(lab, idx)			\
+  void						\
+  stpv2si_##lab (v2si *x, int a)		\
+  {						\
+    v2si tmp = {a, a};				\
+    x[idx] = tmp;				\
+  }						\
+
+/* Core test, no imm assembler offset:  */
+
+TESTV2SI(0, 0)
+TESTV2DI(0, 0)
+/* { dg-final { scan-assembler {\s+stp\t(w[0-9]+), \1, \[x[0-9]+\]} } } */
+/* { dg-final { scan-assembler {\s+stp\t(x[0-9]+), \1, \[x[0-9]+\]} } } */
+
+/* Lower offset bounds:  */
+
+/* Valid offsets:  */
+TESTV2SI(1, -32)
+TESTV2DI(1, -32)
+/* { dg-final { scan-assembler {\s+stp\t(w[0-9]+), \1, \[x[0-9]+, -256\]} } } */
+/* { dg-final { scan-assembler {\s+stp\t(x[0-9]+), \1, \[x[0-9]+, -512\]} } } */
+/* Invalid offsets:  */
+TESTV2SI(2, -33)
+TESTV2DI(2, -33)
+/* { dg-final { scan-assembler-not {\s+stp\t(w[0-9]+), \1, \[x[0-9]+, -264\]} } } */
+/* { dg-final { scan-assembler-not {\s+stp\t(x[0-9]+), \1, \[x[0-9]+, -528\]} } } */
+
+/* Upper offset bounds:   */
+
+/* Valid offsets:  */
+TESTV2SI(3, 31)
+TESTV2DI(3, 31)
+/* { dg-final { scan-assembler {\s+stp\t(w[0-9]+), \1, \[x[0-9]+, 248\]} } } */
+/* { dg-final { scan-assembler {\s+stp\t(x[0-9]+), \1, \[x[0-9]+, 496\]} } } */
+/* Invalid offsets:  */
+TESTV2SI(4, 32)
+TESTV2DI(4, 32)
+/* { dg-final { scan-assembler-not {\s+stp\t(w[0-9]+), \1, \[x[0-9]+, 256\]} } } */
+/* { dg-final { scan-assembler-not {\s+stp\t(x[0-9]+), \1, \[x[0-9]+, 512\]} } } */
+
+