From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
To: GCC Patches <gcc-patches@gcc.gnu.org>
Cc: Richard Sandiford <Richard.Sandiford@arm.com>,
Richard Earnshaw <Richard.Earnshaw@arm.com>
Subject: Re: [PATCH] AArch64: Cleanup memset expansion
Date: Mon, 6 Nov 2023 12:11:50 +0000 [thread overview]
Message-ID: <DB3PR08MB8986D5D28C0FD01BBE78926583AAA@DB3PR08MB8986.eurprd08.prod.outlook.com> (raw)
In-Reply-To: <PAWPR08MB898262EC1D06EA3207A4372483D4A@PAWPR08MB8982.eurprd08.prod.outlook.com>
ping
Clean up the memset expansion. Similar to memcpy/memmove, use a byte offset and
byte counts throughout. Simplify the complex instruction-count calculations when
optimizing for size by using a fixed size limit instead.
Passes regress/bootstrap, OK for commit?
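For illustration, the byte-offset splitting loop this patch introduces behaves
roughly like the standalone sketch below. This is plain C++ meant only to show
the chunking behaviour, not the actual RTL expansion code; the 32/16-byte
set_max choice, the len <= 24 check and the overlapping tail mirror the patch,
everything else (names, printf output) is made up for the example.

/* Standalone sketch of the splitting logic in aarch64_expand_setmem after
   this patch: pick the largest power-of-two chunk up to set_max, then for a
   short tail emit one overlapping write that ends exactly at len.
   Illustration only, not the GCC RTL code.  */
#include <algorithm>
#include <cstdio>

static void
sketch_setmem_chunks (unsigned len, bool allow_qreg_stp)
{
  /* Allow 32-byte store pairs of Q registers unless the tuning forbids it
     or the set is small (mirrors the len <= 24 check in the patch).  */
  unsigned set_max = (len <= 24 || !allow_qreg_stp) ? 16 : 32;
  unsigned offset = 0;

  while (len > 0)
    {
      /* Largest power-of-two chunk that fits both len and set_max.  */
      unsigned chunk = 1;
      while (chunk * 2 <= std::min (len, set_max))
        chunk *= 2;

      printf ("store %u bytes at offset %u\n", chunk, offset);
      len -= chunk;
      offset += chunk;

      /* Trailing bytes: round up to the next power of two and move the
         offset back so the final write overlaps the previous one.  */
      if (len > 0 && len < set_max / 2)
        {
          unsigned tail = 1;
          while (tail < len)
            tail *= 2;
          offset -= tail - len;
          len = tail;
        }
    }
}

int
main ()
{
  /* E.g. a 35-byte memset: one 32-byte store pair, then a 4-byte store
     at offset 31 that overlaps the previous store by one byte.  */
  sketch_setmem_chunks (35, true);
  return 0;
}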
gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
(aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
(aarch64_expand_setmem): Clean up implementation, use byte offsets,
simplify size calculation.
---
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8..578a253d6e0e133e19592553fc873b3e73f9f218 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25229,15 +25229,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
next, amount);
}
-/* Return a new RTX holding the result of moving POINTER forward by the
- size of the mode it points to. */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
- return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
/* Copy one block of size MODE from SRC to DST at offset OFFSET. */
static void
@@ -25393,46 +25384,22 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
return true;
}
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
- SRC is a register we have created with the duplicated value to be set. */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC. */
static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
- machine_mode mode)
-{
- /* If we are copying 128bits or 256bits, we can do that straight from
- the SIMD register we prepared. */
- if (known_eq (GET_MODE_BITSIZE (mode), 256))
- {
- mode = GET_MODE (src);
- /* "Cast" the *dst to the correct mode. */
- *dst = adjust_address (*dst, mode, 0);
- /* Emit the memset. */
- emit_insn (aarch64_gen_store_pair (mode, *dst, src,
- aarch64_progress_pointer (*dst), src));
-
- /* Move the pointers forward. */
- *dst = aarch64_move_pointer (*dst, 32);
- return;
- }
- if (known_eq (GET_MODE_BITSIZE (mode), 128))
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
+{
+ /* Emit explicit store pair instructions for 32-byte writes. */
+ if (known_eq (GET_MODE_SIZE (mode), 32))
{
- /* "Cast" the *dst to the correct mode. */
- *dst = adjust_address (*dst, GET_MODE (src), 0);
- /* Emit the memset. */
- emit_move_insn (*dst, src);
- /* Move the pointers forward. */
- *dst = aarch64_move_pointer (*dst, 16);
+ mode = V16QImode;
+ rtx dst1 = adjust_address (dst, mode, offset);
+ rtx dst2 = adjust_address (dst, mode, offset + 16);
+ emit_insn (aarch64_gen_store_pair (mode, dst1, src, dst2, src));
return;
}
- /* For copying less, we have to extract the right amount from src. */
- rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
- /* "Cast" the *dst to the correct mode. */
- *dst = adjust_address (*dst, mode, 0);
- /* Emit the memset. */
- emit_move_insn (*dst, reg);
- /* Move the pointer forward. */
- *dst = aarch64_progress_pointer (*dst);
+ if (known_lt (GET_MODE_SIZE (mode), 16))
+ src = lowpart_subreg (mode, src, GET_MODE (src));
+ emit_move_insn (adjust_address (dst, mode, offset), src);
}
/* Expand a setmem using the MOPS instructions. OPERANDS are the same
@@ -25461,7 +25428,7 @@ aarch64_expand_setmem_mops (rtx *operands)
bool
aarch64_expand_setmem (rtx *operands)
{
- int n, mode_bits;
+ int mode_bytes;
unsigned HOST_WIDE_INT len;
rtx dst = operands[0];
rtx val = operands[2], src;
@@ -25474,104 +25441,70 @@ aarch64_expand_setmem (rtx *operands)
|| (STRICT_ALIGNMENT && align < 16))
return aarch64_expand_setmem_mops (operands);
- bool size_p = optimize_function_for_size_p (cfun);
-
/* Default the maximum to 256-bytes when considering only libcall vs
SIMD broadcast sequence. */
unsigned max_set_size = 256;
unsigned mops_threshold = aarch64_mops_memset_size_threshold;
+ /* Reduce the maximum size with -Os. */
+ if (optimize_function_for_size_p (cfun))
+ max_set_size = 96;
+
len = UINTVAL (operands[1]);
/* Large memset uses MOPS when available or a library call. */
if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
return aarch64_expand_setmem_mops (operands);
- int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
- /* The MOPS sequence takes:
- 3 instructions for the memory storing
- + 1 to move the constant size into a reg
- + 1 if VAL is a non-zero constant to move into a reg
- (zero constants can use XZR directly). */
- unsigned mops_cost = 3 + 1 + cst_val;
- /* A libcall to memset in the worst case takes 3 instructions to prepare
- the arguments + 1 for the call. */
- unsigned libcall_cost = 4;
-
- /* Attempt a sequence with a vector broadcast followed by stores.
- Count the number of operations involved to see if it's worth it
- against the alternatives. A simple counter simd_ops on the
- algorithmically-relevant operations is used rather than an rtx_insn count
- as all the pointer adjusmtents and mode reinterprets will be optimized
- away later. */
- start_sequence ();
- unsigned simd_ops = 0;
-
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
/* Prepare the val using a DUP/MOVI v0.16B, val. */
src = expand_vector_broadcast (V16QImode, val);
src = force_reg (V16QImode, src);
- simd_ops++;
- /* Convert len to bits to make the rest of the code simpler. */
- n = len * BITS_PER_UNIT;
- /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
- AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
- const int copy_limit = (aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
- ? GET_MODE_BITSIZE (TImode) : 256;
+ /* Set maximum amount to write in one go. We allow 32-byte chunks based
+ on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
+ unsigned set_max = 32;
+
+ if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+ set_max = 16;
- while (n > 0)
+ int offset = 0;
+ while (len > 0)
{
/* Find the largest mode in which to do the copy without
over writing. */
opt_scalar_int_mode mode_iter;
FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
- if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+ if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
cur_mode = mode_iter.require ();
gcc_assert (cur_mode != BLKmode);
- mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
- aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
- simd_ops++;
- n -= mode_bits;
+ mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+
+ /* Prefer Q-register accesses for the last bytes. */
+ if (mode_bytes == 16)
+ cur_mode = V16QImode;
+
+ aarch64_set_one_block (src, dst, offset, cur_mode);
+ len -= mode_bytes;
+ offset += mode_bytes;
/* Emit trailing writes using overlapping unaligned accesses
- (when !STRICT_ALIGNMENT) - this is smaller and faster. */
- if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+ (when !STRICT_ALIGNMENT) - this is smaller and faster. */
+ if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
{
- next_mode = smallest_mode_for_size (n, MODE_INT);
- int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
- gcc_assert (n_bits <= mode_bits);
- dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
- n = n_bits;
+ next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
+ int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+ gcc_assert (n_bytes <= mode_bytes);
+ offset -= n_bytes - len;
+ len = n_bytes;
}
}
- rtx_insn *seq = get_insns ();
- end_sequence ();
-
- if (size_p)
- {
- /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
- call to memset or the MOPS expansion. */
- if (TARGET_MOPS
- && mops_cost <= libcall_cost
- && mops_cost <= simd_ops)
- return aarch64_expand_setmem_mops (operands);
- /* If MOPS is not available or not shorter pick a libcall if the SIMD
- sequence is too long. */
- else if (libcall_cost < simd_ops)
- return false;
- emit_insn (seq);
- return true;
- }
- /* At this point the SIMD broadcast sequence is the best choice when
- optimizing for speed. */
- emit_insn (seq);
return true;
}
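As a rough illustration of the simplified -Os handling above, the strategy
choice now reduces to a plain size gate. The sketch below is hedged: the
96/256 limits and the MOPS-threshold comparison come from the patch, while
the function name, the threshold value passed in main and the printed labels
are invented for the example.

/* Sketch of the simplified strategy selection: with this patch, -Os just
   lowers the inline limit to 96 bytes instead of comparing instruction
   counts of the SIMD sequence, a libcall and the MOPS expansion.  */
#include <cstdio>

static const char *
setmem_strategy (unsigned long len, bool optimize_size, bool have_mops,
                 unsigned mops_threshold)
{
  unsigned max_set_size = optimize_size ? 96 : 256;
  if (len > max_set_size || (have_mops && len > mops_threshold))
    return have_mops ? "MOPS (SETP/SETM/SETE)" : "libcall to memset";
  return "inline SIMD broadcast + stores";
}

int
main ()
{
  printf ("%s\n", setmem_strategy (128, true, false, 256));   /* libcall */
  printf ("%s\n", setmem_strategy (128, false, false, 256));  /* inline  */
  return 0;
}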
Thread overview: 16+ messages
2023-10-19 12:51 Wilco Dijkstra
2023-11-06 12:11 ` Wilco Dijkstra [this message]
2023-11-10 9:50 ` Kyrylo Tkachov
2023-11-10 10:17 ` Wilco Dijkstra
2023-11-10 11:30 ` Richard Earnshaw
2023-11-10 14:46 ` Kyrylo Tkachov
2023-11-10 15:13 ` Richard Earnshaw
2023-11-14 16:23 ` [PATCH v2] " Wilco Dijkstra
2023-11-14 16:36 ` Richard Earnshaw
2023-11-14 16:56 ` Wilco Dijkstra
2023-12-22 14:25 ` [PATCH v3] " Wilco Dijkstra
2024-01-05 10:53 ` Richard Sandiford
2024-01-09 20:51 ` [PATCH v4] " Wilco Dijkstra
2024-01-10 18:13 ` Richard Sandiford
2024-01-30 15:51 ` Wilco Dijkstra
2024-02-01 17:32 ` Richard Sandiford