public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-5934] aarch64: Use +mops to inline memset operations
@ 2021-12-13 15:18 Kyrylo Tkachov
0 siblings, 0 replies; only message in thread
From: Kyrylo Tkachov @ 2021-12-13 15:18 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:d3bd985e799b63e2133e89870472ac36d06015d3
commit r12-5934-gd3bd985e799b63e2133e89870472ac36d06015d3
Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Mon Dec 13 14:14:21 2021 +0000
aarch64: Use +mops to inline memset operations
This 3rd patch in the series adds an inline sequence for the memset operation.
The aarch64-mops-memset-size-threshold param is added to control the size threshold for the sequence.
Its default setting is 256, which may seem a bit high, but it is consistent with the current
SIMD memset inline sequence limit, and future CPU tunings can override it easily as needed.
Bootstrapped and tested on aarch64-none-linux-gnu.
gcc/ChangeLog:
* config/aarch64/aarch64.c (aarch64_expand_setmem_mops): Define.
(aarch64_expand_setmem): Adjust for TARGET_MOPS.
* config/aarch64/aarch64.h (CLEAR_RATIO): Adjust for TARGET_MOPS.
(SET_RATIO): Likewise.
* config/aarch64/aarch64.md ("unspec"): Add UNSPEC_SETMEM.
(aarch64_setmemdi): Define.
(setmemdi): Adjust for TARGET_MOPS.
* config/aarch64/aarch64.opt (aarch64-mops-memset-size-threshold):
New param.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/mops_3.c: New test.
Diff:
---
gcc/config/aarch64/aarch64.c | 93 ++++++++++++++++++++++++-------
gcc/config/aarch64/aarch64.h | 4 +-
gcc/config/aarch64/aarch64.md | 20 +++++--
gcc/config/aarch64/aarch64.opt | 4 ++
gcc/testsuite/gcc.target/aarch64/mops_3.c | 85 ++++++++++++++++++++++++++++
5 files changed, 181 insertions(+), 25 deletions(-)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index bd754e4e2c2..d11a40c02cd 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23754,6 +23754,28 @@ aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
*dst = aarch64_progress_pointer (*dst);
}
+/* Expand a setmem using the MOPS instructions. OPERANDS are the same
+ as for the setmem pattern. Return true iff we succeed. */
+static bool
+aarch64_expand_setmem_mops (rtx *operands)
+{
+ if (!TARGET_MOPS)
+ return false;
+
+ rtx addr_dst = XEXP (operands[0], 0);
+ rtx sz_reg = operands[1];
+ rtx val = operands[2];
+
+ if (!REG_P (sz_reg))
+ sz_reg = force_reg (DImode, sz_reg);
+ if (!REG_P (addr_dst))
+ addr_dst = force_reg (DImode, addr_dst);
+ if (!REG_P (val) && val != CONST0_RTX (QImode))
+ val = force_reg (QImode, val);
+ emit_insn (gen_aarch64_setmemdi (addr_dst, val, sz_reg));
+ return true;
+}
+
/* Expand setmem, as if from a __builtin_memset. Return true if
we succeed, otherwise return false. */
@@ -23767,39 +23789,59 @@ aarch64_expand_setmem (rtx *operands)
rtx base;
machine_mode cur_mode = BLKmode, next_mode;
- /* We can't do anything smart if the amount to copy is not constant. */
- if (!CONST_INT_P (operands[1]))
- return false;
+ /* If we don't have SIMD registers or the size is variable use the MOPS
+ inlined sequence if possible. */
+ if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+ return aarch64_expand_setmem_mops (operands);
bool size_p = optimize_function_for_size_p (cfun);
- /* Default the maximum to 256-bytes. */
+ /* Default the maximum to 256-bytes when considering only libcall vs
+ SIMD broadcast sequence. */
unsigned max_set_size = 256;
len = INTVAL (operands[1]);
-
- /* Upper bound check. */
- if (len > max_set_size)
+ if (len > max_set_size && !TARGET_MOPS)
return false;
+ int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
+ /* The MOPS sequence takes:
+ 3 instructions for the memory storing
+ + 1 to move the constant size into a reg
+ + 1 if VAL is a non-zero constant to move into a reg
+ (zero constants can use XZR directly). */
+ unsigned mops_cost = 3 + 1 + cst_val;
+ /* A libcall to memset in the worst case takes 3 instructions to prepare
+ the arguments + 1 for the call. */
+ unsigned libcall_cost = 4;
+
+ /* Upper bound check. For large constant-sized setmem use the MOPS sequence
+ when available. */
+ if (TARGET_MOPS
+ && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
+ return aarch64_expand_setmem_mops (operands);
+
/* Attempt a sequence with a vector broadcast followed by stores.
- Count the number of operations involved to see if it's worth it for
- code size. */
+ Count the number of operations involved to see if it's worth it
+ against the alternatives. A simple counter simd_ops on the
+ algorithmically-relevant operations is used rather than an rtx_insn count
+   as all the pointer adjustments and mode reinterprets will be optimized
+ away later. */
start_sequence ();
- unsigned nops = 0;
+ unsigned simd_ops = 0;
+
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
/* Prepare the val using a DUP/MOVI v0.16B, val. */
src = expand_vector_broadcast (V16QImode, val);
src = force_reg (V16QImode, src);
- nops++;
+ simd_ops++;
/* Convert len to bits to make the rest of the code simpler. */
n = len * BITS_PER_UNIT;
/* Maximum amount to copy in one go. We allow 256-bit chunks based on the
- AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. setmem expand
- pattern is only turned on for TARGET_SIMD. */
+ AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
const int copy_limit = (aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
? GET_MODE_BITSIZE (TImode) : 256;
@@ -23817,7 +23859,7 @@ aarch64_expand_setmem (rtx *operands)
mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
- nops++;
+ simd_ops++;
n -= mode_bits;
/* Do certain trailing copies as overlapping if it's going to be
@@ -23835,12 +23877,25 @@ aarch64_expand_setmem (rtx *operands)
}
rtx_insn *seq = get_insns ();
end_sequence ();
- /* A call to memset in the worst case requires 3 instructions to prepare
- the arguments + 1 for the call. Prefer the inline sequence for size if
- it is no longer than that. */
- if (size_p && nops > 4)
- return false;
+ if (size_p)
+ {
+ /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
+ call to memset or the MOPS expansion. */
+ if (TARGET_MOPS
+ && mops_cost <= libcall_cost
+ && mops_cost <= simd_ops)
+ return aarch64_expand_setmem_mops (operands);
+ /* If MOPS is not available or not shorter pick a libcall if the SIMD
+ sequence is too long. */
+ else if (libcall_cost < simd_ops)
+ return false;
+ emit_insn (seq);
+ return true;
+ }
+
+ /* At this point the SIMD broadcast sequence is the best choice when
+ optimizing for speed. */
emit_insn (seq);
return true;
}
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 79d0bcd357f..2478d0db290 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1063,14 +1063,14 @@ typedef struct
Otherwise follow a sensible default: when optimizing for size, give a better
estimate of the length of a memset call, but use the default otherwise. */
#define CLEAR_RATIO(speed) \
- (!STRICT_ALIGNMENT ? 4 : (speed) ? 15 : AARCH64_CALL_RATIO)
+ (!STRICT_ALIGNMENT ? (TARGET_MOPS ? 0 : 4) : (speed) ? 15 : AARCH64_CALL_RATIO)
/* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant. Without
-mstrict-align, make decisions in "setmem". Otherwise follow a sensible
default: when optimizing for size adjust the ratio to account for the
overhead of loading the constant. */
#define SET_RATIO(speed) \
- (!STRICT_ALIGNMENT ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
+ ((!STRICT_ALIGNMENT || TARGET_MOPS) ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
/* Disable auto-increment in move_by_pieces et al. Use of auto-increment is
rarely a good idea in straight-line code since it adds an extra address
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b71c171ca04..9e50a26e6f4 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -204,6 +204,7 @@
UNSPEC_SABDL2
UNSPEC_SADALP
UNSPEC_SCVTF
+ UNSPEC_SETMEM
UNSPEC_SISD_NEG
UNSPEC_SISD_SSHL
UNSPEC_SISD_USHL
@@ -1650,18 +1651,29 @@
}
)
+(define_insn "aarch64_setmemdi"
+ [(parallel [
+ (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
+ (clobber (match_operand:DI 0 "register_operand" "+&r"))
+ (set (mem:BLK (match_dup 0))
+ (unspec:BLK [(match_operand:QI 1 "aarch64_reg_or_zero" "rZ")
+ (match_dup 2)] UNSPEC_SETMEM))])]
+ "TARGET_MOPS"
+ "setp\t[%x0]!, %x2!, %x1\;setm\t[%x0]!, %x2!, %x1\;sete\t[%x0]!, %x2!, %x1"
+ [(set_attr "length" "12")]
+)
+
;; 0 is dst
;; 1 is val
;; 2 is size of copy in bytes
;; 3 is alignment
-
(define_expand "setmemdi"
[(set (match_operand:BLK 0 "memory_operand") ;; Dest
(match_operand:QI 2 "nonmemory_operand")) ;; Value
- (use (match_operand:DI 1 "immediate_operand")) ;; Length
+ (use (match_operand:DI 1 "general_operand")) ;; Length
(match_operand 3 "immediate_operand")] ;; Align
- "TARGET_SIMD"
-{
+ "TARGET_SIMD || TARGET_MOPS"
+ {
if (aarch64_expand_setmem (operands))
DONE;
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 33788ffd9e3..264739e6143 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -288,3 +288,7 @@ Constant memcpy size in bytes above which to start using MOPS sequence.
-param=aarch64-mops-memmove-size-threshold=
Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
Constant memmove size in bytes above which to start using MOPS sequence.
+
+-param=aarch64-mops-memset-size-threshold=
+Target Joined UInteger Var(aarch64_mops_memset_size_threshold) Init(256) Param
+Constant memset size in bytes from which to start using MOPS sequence.
diff --git a/gcc/testsuite/gcc.target/aarch64/mops_3.c b/gcc/testsuite/gcc.target/aarch64/mops_3.c
new file mode 100644
index 00000000000..0eda2ffb578
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mops_3.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.6-a+mops --param=aarch64-mops-memset-size-threshold=0" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdlib.h>
+
+/* We want to inline variable-sized memset.
+** do_it_set:
+** setp \[x0\]\!, x2\!, x1
+** setm \[x0\]\!, x2\!, x1
+** sete \[x0\]\!, x2\!, x1
+** ret
+*/
+void do_it_set (char * out, int n, size_t size)
+{
+ __builtin_memset (out, n, size);
+}
+
+/*
+** do_it_set_large:
+** mov w2, 1
+** mov x1, 1024
+** setp \[x0\]\!, x1\!, x2
+** setm \[x0\]\!, x1\!, x2
+** sete \[x0\]\!, x1\!, x2
+** ret
+*/
+void do_it_set_large (char * out)
+{
+ __builtin_memset (out, 1, 1024);
+}
+
+/*
+** do_it_set_256:
+** mov w2, 1
+** mov x1, 256
+** setp \[x0\]\!, x1\!, x2
+** setm \[x0\]\!, x1\!, x2
+** sete \[x0\]\!, x1\!, x2
+** ret
+*/
+void do_it_set_256 (char * out)
+{
+ __builtin_memset (out, 1, 256);
+}
+
+/*
+** do_it_set_255:
+** mov w2, 1
+** mov x1, 255
+** setp \[x0\]\!, x1\!, x2
+** setm \[x0\]\!, x1\!, x2
+** sete \[x0\]\!, x1\!, x2
+** ret
+*/
+void do_it_set_255 (char * out)
+{
+ __builtin_memset (out, 1, 255);
+}
+
+/*
+** do_it_set_0:
+** setp \[x0\]\!, x1\!, xzr
+** setm \[x0\]\!, x1\!, xzr
+** sete \[x0\]\!, x1\!, xzr
+** ret
+*/
+void do_it_set_0 (char * out, size_t n)
+{
+ __builtin_memset (out, 0, n);
+}
+
+/*
+** do_it_set_0_255:
+** mov x1, 255
+** setp \[x0\]\!, x1\!, xzr
+** setm \[x0\]\!, x1\!, xzr
+** sete \[x0\]\!, x1\!, xzr
+** ret
+*/
+void do_it_set_0_255 (char * out)
+{
+ __builtin_memset (out, 0, 255);
+}
+
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2021-12-13 15:18 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-13 15:18 [gcc r12-5934] aarch64: Use +mops to inline memset operations Kyrylo Tkachov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).