From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <acoplan@sourceware.org>
Received: by sourceware.org (Postfix, from userid 7810)
	id 302A4385843E; Thu, 19 Oct 2023 10:15:20 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 302A4385843E
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1697710520;
	bh=j5UxmeB8QrRafmrei5a0xiQckMIsZMf6UP5Zqyoi4ks=;
	h=From:To:Subject:Date:From;
	b=MGWElkgb1Dj51RVSTL6PZ6T5Z1cHi08GZqJkbFBDs/B8m13Nr/eLaxffkjPZZPTUp
	 TbKUGBqpZE3hhrKDAx89NKbRyZ3MJErYK/INBBOGAIpnRTJoxxZM9tfsehznR1bOHd
	 KaAYDQKlXd0ovYRB7NVj6SxwjVa5QznomV8egpPE=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Alex Coplan <acoplan@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-4743] aarch64: Generalise TFmode load/store pair patterns
X-Act-Checkin: gcc
X-Git-Author: Alex Coplan <alex.coplan@arm.com>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 61ea0a89c6face2ce5651e4819adea5d9b613f3d
X-Git-Newrev: 947fb34a165af881ad53fc8ebd226660f7bc199e
Message-Id: <20231019101520.302A4385843E@sourceware.org>
Date: Thu, 19 Oct 2023 10:15:20 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:947fb34a165af881ad53fc8ebd226660f7bc199e

commit r14-4743-g947fb34a165af881ad53fc8ebd226660f7bc199e
Author: Alex Coplan <alex.coplan@arm.com>
Date:   Fri Oct 6 13:35:45 2023 +0100

    aarch64: Generalise TFmode load/store pair patterns
    
    This patch generalises the TFmode load/store pair patterns to TImode and
    TDmode.  This brings them in line with the DXmode patterns, and uses the
    same technique with separate mode iterators (TX and TX2) to allow for
    distinct modes in each arm of the load/store pair.
    
    For example, in combination with the post-RA load/store pair fusion pass
    in the following patch, this improves the codegen for the following
    varargs testcase involving TImode stores:
    
    void g(void *);
    int foo(int x, ...)
    {
        __builtin_va_list ap;
        __builtin_va_start (ap, x);
        g(&ap);
        __builtin_va_end (ap);
    }
    
    from:
    
    foo:
    .LFB0:
            stp     x29, x30, [sp, -240]!
    .LCFI0:
            mov     w9, -56
            mov     w8, -128
            mov     x29, sp
            add     x10, sp, 176
            stp     x1, x2, [sp, 184]
            add     x1, sp, 240
            add     x0, sp, 16
            stp     x1, x1, [sp, 16]
            str     x10, [sp, 32]
            stp     w9, w8, [sp, 40]
            str     q0, [sp, 48]
            str     q1, [sp, 64]
            str     q2, [sp, 80]
            str     q3, [sp, 96]
            str     q4, [sp, 112]
            str     q5, [sp, 128]
            str     q6, [sp, 144]
            str     q7, [sp, 160]
            stp     x3, x4, [sp, 200]
            stp     x5, x6, [sp, 216]
            str     x7, [sp, 232]
            bl      g
            ldp     x29, x30, [sp], 240
    .LCFI1:
            ret
    
    to:
    
    foo:
    .LFB0:
            stp     x29, x30, [sp, -240]!
    .LCFI0:
            mov     w9, -56
            mov     w8, -128
            mov     x29, sp
            add     x10, sp, 176
            stp     x1, x2, [sp, 184]
            add     x1, sp, 240
            add     x0, sp, 16
            stp     x1, x1, [sp, 16]
            str     x10, [sp, 32]
            stp     w9, w8, [sp, 40]
            stp     q0, q1, [sp, 48]
            stp     q2, q3, [sp, 80]
            stp     q4, q5, [sp, 112]
            stp     q6, q7, [sp, 144]
            stp     x3, x4, [sp, 200]
            stp     x5, x6, [sp, 216]
            str     x7, [sp, 232]
            bl      g
            ldp     x29, x30, [sp], 240
    .LCFI1:
            ret
    
    Note that this patch isn't needed if we only use the mode
    canonicalization approach in the new ldp fusion pass (since we
    canonicalize T{I,F,D}mode to V16QImode), but we seem to get slightly
    better performance with mode canonicalization disabled (see
    --param=aarch64-ldp-canonicalize-modes in the following patch).
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64.md (load_pair_dw_tftf): Rename to ...
            (load_pair_dw_<TX:mode><TX2:mode>): ... this.
            (store_pair_dw_tftf): Rename to ...
            (store_pair_dw_<TX:mode><TX2:mode>): ... this.
            * config/aarch64/iterators.md (TX2): New.

Diff:
---
 gcc/config/aarch64/aarch64.md   | 22 +++++++++++-----------
 gcc/config/aarch64/iterators.md |  3 +++
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 32c7adc89281..e6af09c2e8b3 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1757,16 +1757,16 @@
   }
 )
 
-(define_insn "load_pair_dw_tftf"
-  [(set (match_operand:TF 0 "register_operand" "=w")
-	(match_operand:TF 1 "aarch64_mem_pair_operand" "Ump"))
-   (set (match_operand:TF 2 "register_operand" "=w")
-	(match_operand:TF 3 "memory_operand" "m"))]
+(define_insn "load_pair_dw_<TX:mode><TX2:mode>"
+  [(set (match_operand:TX 0 "register_operand" "=w")
+	(match_operand:TX 1 "aarch64_mem_pair_operand" "Ump"))
+   (set (match_operand:TX2 2 "register_operand" "=w")
+	(match_operand:TX2 3 "memory_operand" "m"))]
    "TARGET_SIMD
     && rtx_equal_p (XEXP (operands[3], 0),
 		    plus_constant (Pmode,
 				   XEXP (operands[1], 0),
-				   GET_MODE_SIZE (TFmode)))"
+				   GET_MODE_SIZE (<TX:MODE>mode)))"
   "ldp\\t%q0, %q2, %z1"
   [(set_attr "type" "neon_ldp_q")
    (set_attr "fp" "yes")]
@@ -1805,11 +1805,11 @@
   }
 )
 
-(define_insn "store_pair_dw_tftf"
-  [(set (match_operand:TF 0 "aarch64_mem_pair_operand" "=Ump")
-	(match_operand:TF 1 "register_operand" "w"))
-   (set (match_operand:TF 2 "memory_operand" "=m")
-	(match_operand:TF 3 "register_operand" "w"))]
+(define_insn "store_pair_dw_<TX:mode><TX2:mode>"
+  [(set (match_operand:TX 0 "aarch64_mem_pair_operand" "=Ump")
+	(match_operand:TX 1 "register_operand" "w"))
+   (set (match_operand:TX2 2 "memory_operand" "=m")
+	(match_operand:TX2 3 "register_operand" "w"))]
    "TARGET_SIMD &&
     rtx_equal_p (XEXP (operands[2], 0),
 		 plus_constant (Pmode,
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 2451d8c2cd8e..f9e2210095ea 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -319,6 +319,9 @@
 
 (define_mode_iterator TX [TI TF TD])
 
+;; Duplicate of the above
+(define_mode_iterator TX2 [TI TF TD])
+
 (define_mode_iterator VTX [TI TF TD V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF])
 
 ;; Advanced SIMD opaque structure modes.