* [PATCH 10/11] aarch64: Generalise TFmode load/store pair patterns
@ 2023-10-17 20:50 Alex Coplan
2023-10-18 18:16 ` Richard Sandiford
0 siblings, 1 reply; 2+ messages in thread
From: Alex Coplan @ 2023-10-17 20:50 UTC (permalink / raw)
To: gcc-patches; +Cc: Richard Earnshaw, Richard Sandiford, Kyrylo Tkachov
[-- Attachment #1: Type: text/plain, Size: 2332 bytes --]
This patch generalises the TFmode load/store pair patterns to TImode and
TDmode. This brings them in line with the DXmode patterns, and uses the
same technique with separate mode iterators (TX and TX2) to allow for
distinct modes in each arm of the load/store pair.
For example, in combination with the post-RA load/store pair fusion pass
in the following patch, this improves the codegen for the following
varargs testcase involving TImode stores:
void g(void *);
int foo(int x, ...)
{
__builtin_va_list ap;
__builtin_va_start (ap, x);
g(&ap);
__builtin_va_end (ap);
}
from:
foo:
.LFB0:
stp x29, x30, [sp, -240]!
.LCFI0:
mov w9, -56
mov w8, -128
mov x29, sp
add x10, sp, 176
stp x1, x2, [sp, 184]
add x1, sp, 240
add x0, sp, 16
stp x1, x1, [sp, 16]
str x10, [sp, 32]
stp w9, w8, [sp, 40]
str q0, [sp, 48]
str q1, [sp, 64]
str q2, [sp, 80]
str q3, [sp, 96]
str q4, [sp, 112]
str q5, [sp, 128]
str q6, [sp, 144]
str q7, [sp, 160]
stp x3, x4, [sp, 200]
stp x5, x6, [sp, 216]
str x7, [sp, 232]
bl g
ldp x29, x30, [sp], 240
.LCFI1:
ret
to:
foo:
.LFB0:
stp x29, x30, [sp, -240]!
.LCFI0:
mov w9, -56
mov w8, -128
mov x29, sp
add x10, sp, 176
stp x1, x2, [sp, 184]
add x1, sp, 240
add x0, sp, 16
stp x1, x1, [sp, 16]
str x10, [sp, 32]
stp w9, w8, [sp, 40]
stp q0, q1, [sp, 48]
stp q2, q3, [sp, 80]
stp q4, q5, [sp, 112]
stp q6, q7, [sp, 144]
stp x3, x4, [sp, 200]
stp x5, x6, [sp, 216]
str x7, [sp, 232]
bl g
ldp x29, x30, [sp], 240
.LCFI1:
ret
Note that this patch isn't needed if we only use the mode
canonicalization approach in the new ldp fusion pass (since we
canonicalize T{I,F,D}mode to V16QImode), but we seem to get slightly
better performance with mode canonicalization disabled (see
--param=aarch64-ldp-canonicalize-modes in the following patch).
Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
gcc/ChangeLog:
* config/aarch64/aarch64.md (load_pair_dw_tftf): Rename to ...
(load_pair_dw_<TX:mode><TX2:mode>): ... this.
(store_pair_dw_tftf): Rename to ...
(store_pair_dw_<TX:mode><TX2:mode>): ... this.
* config/aarch64/iterators.md (TX2): New.
---
gcc/config/aarch64/aarch64.md | 22 +++++++++++-----------
gcc/config/aarch64/iterators.md | 3 +++
2 files changed, 14 insertions(+), 11 deletions(-)
[-- Attachment #2: 0010-aarch64-Generalise-TFmode-load-store-pair-patterns.patch --]
[-- Type: text/x-patch, Size: 2221 bytes --]
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 32c7adc8928..e6af09c2e8b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1757,16 +1757,16 @@ (define_insn "load_pair_dw_<DX:mode><DX2:mode>"
}
)
-(define_insn "load_pair_dw_tftf"
- [(set (match_operand:TF 0 "register_operand" "=w")
- (match_operand:TF 1 "aarch64_mem_pair_operand" "Ump"))
- (set (match_operand:TF 2 "register_operand" "=w")
- (match_operand:TF 3 "memory_operand" "m"))]
+(define_insn "load_pair_dw_<TX:mode><TX2:mode>"
+ [(set (match_operand:TX 0 "register_operand" "=w")
+ (match_operand:TX 1 "aarch64_mem_pair_operand" "Ump"))
+ (set (match_operand:TX2 2 "register_operand" "=w")
+ (match_operand:TX2 3 "memory_operand" "m"))]
"TARGET_SIMD
&& rtx_equal_p (XEXP (operands[3], 0),
plus_constant (Pmode,
XEXP (operands[1], 0),
- GET_MODE_SIZE (TFmode)))"
+ GET_MODE_SIZE (<TX:MODE>mode)))"
"ldp\\t%q0, %q2, %z1"
[(set_attr "type" "neon_ldp_q")
(set_attr "fp" "yes")]
@@ -1805,11 +1805,11 @@ (define_insn "store_pair_dw_<DX:mode><DX2:mode>"
}
)
-(define_insn "store_pair_dw_tftf"
- [(set (match_operand:TF 0 "aarch64_mem_pair_operand" "=Ump")
- (match_operand:TF 1 "register_operand" "w"))
- (set (match_operand:TF 2 "memory_operand" "=m")
- (match_operand:TF 3 "register_operand" "w"))]
+(define_insn "store_pair_dw_<TX:mode><TX2:mode>"
+ [(set (match_operand:TX 0 "aarch64_mem_pair_operand" "=Ump")
+ (match_operand:TX 1 "register_operand" "w"))
+ (set (match_operand:TX2 2 "memory_operand" "=m")
+ (match_operand:TX2 3 "register_operand" "w"))]
"TARGET_SIMD &&
rtx_equal_p (XEXP (operands[2], 0),
plus_constant (Pmode,
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 2451d8c2cd8..f9e2210095e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -319,6 +319,9 @@ (define_mode_iterator VS [V2SI V4SI])
(define_mode_iterator TX [TI TF TD])
+;; Duplicate of the above
+(define_mode_iterator TX2 [TI TF TD])
+
(define_mode_iterator VTX [TI TF TD V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF])
;; Advanced SIMD opaque structure modes.
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH 10/11] aarch64: Generalise TFmode load/store pair patterns
2023-10-17 20:50 [PATCH 10/11] aarch64: Generalise TFmode load/store pair patterns Alex Coplan
@ 2023-10-18 18:16 ` Richard Sandiford
0 siblings, 0 replies; 2+ messages in thread
From: Richard Sandiford @ 2023-10-18 18:16 UTC (permalink / raw)
To: Alex Coplan; +Cc: gcc-patches, Richard Earnshaw, Kyrylo Tkachov
Alex Coplan <alex.coplan@arm.com> writes:
> This patch generalises the TFmode load/store pair patterns to TImode and
> TDmode. This brings them in line with the DXmode patterns, and uses the
> same technique with separate mode iterators (TX and TX2) to allow for
> distinct modes in each arm of the load/store pair.
>
> For example, in combination with the post-RA load/store pair fusion pass
> in the following patch, this improves the codegen for the following
> varargs testcase involving TImode stores:
>
> void g(void *);
> int foo(int x, ...)
> {
> __builtin_va_list ap;
> __builtin_va_start (ap, x);
> g(&ap);
> __builtin_va_end (ap);
> }
>
> from:
>
> foo:
> .LFB0:
> stp x29, x30, [sp, -240]!
> .LCFI0:
> mov w9, -56
> mov w8, -128
> mov x29, sp
> add x10, sp, 176
> stp x1, x2, [sp, 184]
> add x1, sp, 240
> add x0, sp, 16
> stp x1, x1, [sp, 16]
> str x10, [sp, 32]
> stp w9, w8, [sp, 40]
> str q0, [sp, 48]
> str q1, [sp, 64]
> str q2, [sp, 80]
> str q3, [sp, 96]
> str q4, [sp, 112]
> str q5, [sp, 128]
> str q6, [sp, 144]
> str q7, [sp, 160]
> stp x3, x4, [sp, 200]
> stp x5, x6, [sp, 216]
> str x7, [sp, 232]
> bl g
> ldp x29, x30, [sp], 240
> .LCFI1:
> ret
>
> to:
>
> foo:
> .LFB0:
> stp x29, x30, [sp, -240]!
> .LCFI0:
> mov w9, -56
> mov w8, -128
> mov x29, sp
> add x10, sp, 176
> stp x1, x2, [sp, 184]
> add x1, sp, 240
> add x0, sp, 16
> stp x1, x1, [sp, 16]
> str x10, [sp, 32]
> stp w9, w8, [sp, 40]
> stp q0, q1, [sp, 48]
> stp q2, q3, [sp, 80]
> stp q4, q5, [sp, 112]
> stp q6, q7, [sp, 144]
> stp x3, x4, [sp, 200]
> stp x5, x6, [sp, 216]
> str x7, [sp, 232]
> bl g
> ldp x29, x30, [sp], 240
> .LCFI1:
> ret
>
> Note that this patch isn't needed if we only use the mode
> canonicalization approach in the new ldp fusion pass (since we
> canonicalize T{I,F,D}mode to V16QImode), but we seem to get slightly
> better performance with mode canonicalization disabled (see
> --param=aarch64-ldp-canonicalize-modes in the following patch).
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64.md (load_pair_dw_tftf): Rename to ...
> (load_pair_dw_<TX:mode><TX2:mode>): ... this.
> (store_pair_dw_tftf): Rename to ...
> (store_pair_dw_<TX:mode><TX2:mode>): ... this.
> * config/aarch64/iterators.md (TX2): New.
OK, thanks. It would be nice to investigate & fix the reasons for
the regressions with canonicalised modes, but I agree that this patch
is a strict improvement, since it fixes a hole in the current scheme.
Richard
> ---
> gcc/config/aarch64/aarch64.md | 22 +++++++++++-----------
> gcc/config/aarch64/iterators.md | 3 +++
> 2 files changed, 14 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 32c7adc8928..e6af09c2e8b 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1757,16 +1757,16 @@ (define_insn "load_pair_dw_<DX:mode><DX2:mode>"
> }
> )
>
> -(define_insn "load_pair_dw_tftf"
> - [(set (match_operand:TF 0 "register_operand" "=w")
> - (match_operand:TF 1 "aarch64_mem_pair_operand" "Ump"))
> - (set (match_operand:TF 2 "register_operand" "=w")
> - (match_operand:TF 3 "memory_operand" "m"))]
> +(define_insn "load_pair_dw_<TX:mode><TX2:mode>"
> + [(set (match_operand:TX 0 "register_operand" "=w")
> + (match_operand:TX 1 "aarch64_mem_pair_operand" "Ump"))
> + (set (match_operand:TX2 2 "register_operand" "=w")
> + (match_operand:TX2 3 "memory_operand" "m"))]
> "TARGET_SIMD
> && rtx_equal_p (XEXP (operands[3], 0),
> plus_constant (Pmode,
> XEXP (operands[1], 0),
> - GET_MODE_SIZE (TFmode)))"
> + GET_MODE_SIZE (<TX:MODE>mode)))"
> "ldp\\t%q0, %q2, %z1"
> [(set_attr "type" "neon_ldp_q")
> (set_attr "fp" "yes")]
> @@ -1805,11 +1805,11 @@ (define_insn "store_pair_dw_<DX:mode><DX2:mode>"
> }
> )
>
> -(define_insn "store_pair_dw_tftf"
> - [(set (match_operand:TF 0 "aarch64_mem_pair_operand" "=Ump")
> - (match_operand:TF 1 "register_operand" "w"))
> - (set (match_operand:TF 2 "memory_operand" "=m")
> - (match_operand:TF 3 "register_operand" "w"))]
> +(define_insn "store_pair_dw_<TX:mode><TX2:mode>"
> + [(set (match_operand:TX 0 "aarch64_mem_pair_operand" "=Ump")
> + (match_operand:TX 1 "register_operand" "w"))
> + (set (match_operand:TX2 2 "memory_operand" "=m")
> + (match_operand:TX2 3 "register_operand" "w"))]
> "TARGET_SIMD &&
> rtx_equal_p (XEXP (operands[2], 0),
> plus_constant (Pmode,
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 2451d8c2cd8..f9e2210095e 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -319,6 +319,9 @@ (define_mode_iterator VS [V2SI V4SI])
>
> (define_mode_iterator TX [TI TF TD])
>
> +;; Duplicate of the above
> +(define_mode_iterator TX2 [TI TF TD])
> +
> (define_mode_iterator VTX [TI TF TD V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF])
>
> ;; Advanced SIMD opaque structure modes.
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2023-10-18 18:16 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-17 20:50 [PATCH 10/11] aarch64: Generalise TFmode load/store pair patterns Alex Coplan
2023-10-18 18:16 ` Richard Sandiford
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).