From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path:
Received: by sourceware.org (Postfix, from userid 1895)
	id D8237384AB45; Fri, 26 Apr 2024 17:26:04 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org D8237384AB45
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1714152364;
	bh=SEoRo0Yibkdm2XMhEQw/e3JDi/Ijrc8HbWjOieNSALg=;
	h=From:To:Subject:Date:From;
	b=NkpSWMcrens8b9+eqyY7e/LPlH5q9zl/jR0ofg50Puueyxs5sQ7TwON6EEZYl7Z1I
	 4OPn5bk8XtaaP/u9f4L2zJflbdM/j4dVjWVXz9fOTMVdZuZrLPWs6e48u3MmjmtttV
	 2kcrxXNn1dFf+nxBEjU9TtneqctGe/MiIKt3b374=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Wilco Dijkstra
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r15-8] AArch64: Cleanup memset expansion
X-Act-Checkin: gcc
X-Git-Author: Wilco Dijkstra
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 768fbb56b3285b2a3cf067881e745e0f8caec215
X-Git-Newrev: 6b86f71165de9ee64fb76489c04ce032dd74ac21
Message-Id: <20240426172604.D8237384AB45@sourceware.org>
Date: Fri, 26 Apr 2024 17:26:04 +0000 (GMT)
List-Id:

https://gcc.gnu.org/g:6b86f71165de9ee64fb76489c04ce032dd74ac21

commit r15-8-g6b86f71165de9ee64fb76489c04ce032dd74ac21
Author: Wilco Dijkstra
Date:   Wed Feb 21 23:34:37 2024 +0000

    AArch64: Cleanup memset expansion

    Clean up the memset implementation.  Similar to memcpy/memmove, use an
    offset and bytes throughout.  Simplify the complex calculations when
    optimizing for size by using a fixed limit.

    gcc/ChangeLog:
            * config/aarch64/aarch64.cc (MAX_SET_SIZE): New define.
            (aarch64_progress_pointer): Remove function.
            (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
            (aarch64_expand_setmem): Clean up implementation, use byte offsets,
            simplify size calculation.

Diff:
---
 gcc/config/aarch64/aarch64.cc | 157 ++++++++++--------------------------------
 1 file changed, 35 insertions(+), 122 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index bf5fb129b45..a4b7db62546 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -103,6 +103,10 @@
 /* Defined for convenience.  */
 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Flags that describe how a function shares certain architectural state
    with its callers.
@@ -26565,15 +26569,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
			     next, amount);
 }

-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
@@ -26699,48 +26694,6 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }

-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
-static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-					    machine_mode mode)
-{
-  /* If we are copying 128bits or 256bits, we can do that straight from
-     the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      emit_move_insn (aarch64_move_pointer (*dst, 16), src);
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
-    {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
-      return;
-    }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
-}
-
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
    as for the setmem pattern.  Return true iff we succeed.  */
 static bool
@@ -26767,24 +26720,21 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;

   /* Variable-sized or strict-align memset may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
       || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);

-  bool size_p = optimize_function_for_size_p (cfun);
-
-  /* Default the maximum to 256-bytes when considering only libcall vs
-     SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  /* Set inline limits for memset.  MOPS has a separate threshold.  */
+  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;

   len = UINTVAL (operands[1]);
@@ -26793,88 +26743,51 @@ aarch64_expand_setmem (rtx *operands)
   if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
     return aarch64_expand_setmem_mops (operands);

-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
-     3 instructions for the memory storing
-     + 1 to move the constant size into a reg
-     + 1 if VAL is a non-zero constant to move into a reg
-     (zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
-
-  /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
-     algorithmically-relevant operations is used rather than an rtx_insn count
-     as all the pointer adjusmtents and mode reinterprets will be optimized
-     away later.  */
-  start_sequence ();
-  unsigned simd_ops = 0;
-
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);

   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
-  src = expand_vector_broadcast (V16QImode, val);
-  src = force_reg (V16QImode, src);
-  simd_ops++;
-  /* Convert len to bits to make the rest of the code simpler.  */
-  n = len * BITS_PER_UNIT;
-
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks.  */
-  const int copy_limit = 256;
+  val = expand_vector_broadcast (V16QImode, val);
+  val = force_reg (V16QImode, val);

-  while (n > 0)
+  int offset = 0;
+  while (len > 0)
     {
       /* Find the largest mode in which to do the copy without
	  over writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
-	  cur_mode = mode_iter.require ();
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
+	  mode = mode_iter.require ();
+
+      gcc_assert (mode != BLKmode);
+
+      mode_bytes = GET_MODE_SIZE (mode).to_constant ();

-      gcc_assert (cur_mode != BLKmode);
+      src = val;

-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      /* Prefer Q-register accesses.  */
+      if (mode_bytes == 16)
+	mode = V16QImode;
+      else
+	src = lowpart_subreg (mode, src, GET_MODE (val));
+
+      emit_move_insn (adjust_address (dst, mode, offset), src);
+      len -= mode_bytes;
+      offset += mode_bytes;

       /* Emit trailing writes using overlapping unaligned accesses
-	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
	 {
-	  next_mode = smallest_mode_for_size (n, MODE_INT);
-	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-	  gcc_assert (n_bits <= mode_bits);
-	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-	  n = n_bits;
+	  next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
+	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+	  gcc_assert (n_bytes <= mode_bytes);
+	  offset -= n_bytes - len;
+	  len = n_bytes;
	 }
     }

-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-
-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-	 call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-	  && mops_cost <= libcall_cost
-	  && mops_cost <= simd_ops)
-	return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-	 sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-	return false;
-      emit_insn (seq);
-      return true;
-    }
-
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
-  emit_insn (seq);
   return true;
 }
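
[Note: the remainder of this message is an illustrative addendum, not part of
the commit.]

The -Os limit of 96 bytes in MAX_SET_SIZE follows from the comment above the
define: the largest -Os expansion is 1 MOVI/DUP plus 3 STP of Q registers, and
each STP of a Q-register pair stores 32 bytes (3 x 32 = 96), so the inline
sequence is at most 4 instructions, the same count as the worst-case memset
call the removed code budgeted (3 argument moves + 1 call).

Below is a minimal standalone sketch, in the same GNU style, of the
store-selection loop that the new aarch64_expand_setmem implements: walk
forward in the largest power-of-two chunk up to 16 bytes, then finish with one
overlapping, possibly unaligned trailing store instead of a run of small
stores.  It is a model only, not GCC code; store_size, covering_size and the
driver in main are made-up names standing in for the mode-iteration machinery.

#include <cstdio>

/* Largest power-of-two store size that fits in LEN, capped at 16 bytes
   (one Q register).  Models the FOR_EACH_MODE_IN_CLASS scan.  */
static unsigned
store_size (unsigned len)
{
  unsigned cap = len < 16 ? len : 16;
  unsigned size = 1;
  while (size * 2 <= cap)
    size *= 2;
  return size;
}

/* Smallest power-of-two store size that covers LEN bytes.  Models
   smallest_mode_for_size.  */
static unsigned
covering_size (unsigned len)
{
  unsigned size = 1;
  while (size < len)
    size *= 2;
  return size;
}

int
main ()
{
  unsigned len = 35;	/* Example memset length.  */
  unsigned offset = 0;

  while (len > 0)
    {
      unsigned bytes = store_size (len);
      printf ("store %2u bytes at offset %u\n", bytes, offset);
      len -= bytes;
      offset += bytes;

      /* Trailing write: back up so a single store covers the remainder,
	 overlapping bytes that were already written.  */
      if (len > 0 && len < 16)
	{
	  unsigned n_bytes = covering_size (len);
	  offset -= n_bytes - len;
	  len = n_bytes;
	}
    }
}

For len = 35 this prints three stores (16 bytes at offset 0, 16 at 16, 4 at
31): the final store is backed up by one byte so that a single 4-byte store
covers the 3 remaining bytes, which is exactly the "offset -= n_bytes - len"
adjustment in the patch and avoids emitting separate 2-byte and 1-byte stores.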