From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2140) id 76AEA3858C20; Fri, 27 Jan 2023 02:25:48 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 76AEA3858C20 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1674786348; bh=CzETawfoiXjemHT1DNrAAIHVE6WotfeZ6PyMFmn85ls=; h=From:To:Subject:Date:From; b=XUm0sjXK6KWE18V4CFXoT+FTm5e4648BQXy70Lj+u6TGrjNLKRnG9VwBpSmF/3+M5 RWosf/EI4CndTJ51C8fmLEl+jIKsMZ4b+b1l/oYWPGcf+R+r5XWvV84qT4SYagR+Ox n1xODZ/yknIx2g0kB5Liod+cCZhofO+/WJgGjM5k= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Alexandre Oliva To: gcc-cvs@gcc.gnu.org Subject: [gcc(refs/users/aoliva/heads/testme)] add memcpy and memmove loop expanders X-Act-Checkin: gcc X-Git-Author: Alexandre Oliva X-Git-Refname: refs/users/aoliva/heads/testme X-Git-Oldrev: 9258b638dbdecef19a55d530854d59e4fce3e21b X-Git-Newrev: 6517957459debe07872931b426376952319d5cdf Message-Id: <20230127022548.76AEA3858C20@sourceware.org> Date: Fri, 27 Jan 2023 02:25:48 +0000 (GMT) List-Id: https://gcc.gnu.org/g:6517957459debe07872931b426376952319d5cdf commit 6517957459debe07872931b426376952319d5cdf Author: Alexandre Oliva Date: Thu Jan 26 22:52:18 2023 -0300 add memcpy and memmove loop expanders Diff: --- gcc/builtins.cc | 2 +- gcc/expr.cc | 194 +++++++++++++++++++++-- gcc/expr.h | 6 +- gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c | 8 + gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c | 7 + 5 files changed, 199 insertions(+), 18 deletions(-) diff --git a/gcc/builtins.cc b/gcc/builtins.cc index bf21fef699f..4bf404a8a78 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -3756,7 +3756,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len, expected_align, expected_size, min_size, max_size, probable_max_size, use_mempcpy_call, &is_move_done, - might_overlap); + might_overlap, tree_ctz (len)); /* Bail out when a mempcpy call would be expanded as libcall and when we have a target that provides a fast implementation diff --git a/gcc/expr.cc b/gcc/expr.cc index 15be1c8db99..aceb3f514fc 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc @@ -80,7 +80,9 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned, HOST_WIDE_INT, unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, bool); -static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned); +static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int); +static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned); +static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned); static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int); static rtx_insn *compress_float_constant (rtx, rtx); static rtx get_subtarget (rtx); @@ -1955,6 +1957,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len, MIN_SIZE is the minimal size of block to move MAX_SIZE is the maximal size of block to move, if it cannot be represented in unsigned HOST_WIDE_INT, than it is mask of all ones. + CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is + known to be a multiple of 1< 1 && !can_move_by_pieces (incr, align)) + incr >>= 1; + + gcc_checking_assert (incr); + + return emit_block_move_via_loop (x, y, size, align, incr); +} + +/* Like emit_block_move_via_sized_loop, but besides choosing INCR so + as to ensure safe moves even in case of overlap, output dynamic + tests to choose between two loops, one moving downwards, another + moving upwards. */ + +static void +emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size, + unsigned int align, + unsigned int ctz_size) +{ + int incr = align / BITS_PER_UNIT; + + if (CONST_INT_P (size)) + ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size))); + + if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr) + incr = HOST_WIDE_INT_1U << ctz_size; + + while (incr > 1 && !int_mode_for_size (incr, 0).exists ()) + incr >>= 1; + + gcc_checking_assert (incr); + + rtx_code_label *upw_label, *end_label; + upw_label = gen_label_rtx (); + end_label = gen_label_rtx (); + + rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX); + rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX); + do_pending_stack_adjust (); + + machine_mode mode = GET_MODE (x_addr); + if (mode != GET_MODE (y_addr)) + { + scalar_int_mode xmode + = smallest_int_mode_for_size (GET_MODE_BITSIZE (mode)); + scalar_int_mode ymode + = smallest_int_mode_for_size (GET_MODE_BITSIZE + (GET_MODE (y_addr))); + if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode)) + mode = ymode; + else + mode = xmode; + +#ifndef POINTERS_EXTEND_UNSIGNED + const int POINTERS_EXTEND_UNSIGNED = 1; +#endif + x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr, + POINTERS_EXTEND_UNSIGNED); + y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr, + POINTERS_EXTEND_UNSIGNED); + } + + /* Test for overlap: if (x >= y || x + size <= y) goto upw_label. */ + emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode, + true, upw_label, + profile_probability::guessed_always () + .apply_scale (5, 10)); + rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true); + tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp); + + emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode, + true, upw_label, + profile_probability::guessed_always () + .apply_scale (8, 10)); + + emit_block_move_via_loop (x, y, size, align, -incr); + + emit_jump (end_label); + emit_label (upw_label); + + emit_block_move_via_loop (x, y, size, align, incr); + + emit_label (end_label); +} + /* A subroutine of emit_block_move. Copy the data via an explicit - loop. This is used only when libcalls are forbidden. */ -/* ??? It'd be nice to copy in hunks larger than QImode. */ + loop. This is used only when libcalls are forbidden, or when + inlining is required. INCR is the block size to be copied in each + loop iteration. If it is negative, the absolute value is used, and + the block is copied backwards. INCR must be a power of two, an + exact divisor for SIZE and ALIGN, and imply a mode that can be + safely copied per iteration assuming no overlap. */ static void emit_block_move_via_loop (rtx x, rtx y, rtx size, - unsigned int align ATTRIBUTE_UNUSED) + unsigned int align, int incr) { rtx_code_label *cmp_label, *top_label; rtx iter, x_addr, y_addr, tmp; @@ -2277,7 +2397,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size, cmp_label = gen_label_rtx (); iter = gen_reg_rtx (iter_mode); - emit_move_insn (iter, const0_rtx); + bool downwards = incr < 0; + rtx iter_init; + rtx_code iter_cond; + rtx iter_limit; + rtx iter_incr; + machine_mode move_mode; + if (downwards) + { + incr = -incr; + iter_init = size; + iter_cond = GE; + iter_limit = const0_rtx; + iter_incr = GEN_INT (incr); + } + else + { + iter_init = const0_rtx; + iter_cond = LT; + iter_limit = size; + iter_incr = GEN_INT (incr); + } + emit_move_insn (iter, iter_init); + + scalar_int_mode int_move_mode + = smallest_int_mode_for_size (incr * BITS_PER_UNIT); + if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT) + { + move_mode = BLKmode; + gcc_checking_assert (can_move_by_pieces (incr, align)); + } + else + move_mode = int_move_mode; x_addr = force_operand (XEXP (x, 0), NULL_RTX); y_addr = force_operand (XEXP (y, 0), NULL_RTX); @@ -2293,19 +2444,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size, tmp = convert_modes (y_addr_mode, iter_mode, iter, true); y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp); - x = change_address (x, QImode, x_addr); - y = change_address (y, QImode, y_addr); + x = change_address (x, move_mode, x_addr); + y = change_address (y, move_mode, y_addr); + + if (move_mode == BLKmode) + { + bool done; + emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL, + align, incr, incr, incr, incr, + false, &done, false); + gcc_checking_assert (done); + } + else + emit_move_insn (x, y); - emit_move_insn (x, y); + if (downwards) + emit_label (cmp_label); - tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter, + tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter, true, OPTAB_LIB_WIDEN); if (tmp != iter) emit_move_insn (iter, tmp); - emit_label (cmp_label); + if (!downwards) + emit_label (cmp_label); - emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, + emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode, true, top_label, profile_probability::guessed_always () .apply_scale (9, 10)); diff --git a/gcc/expr.h b/gcc/expr.h index e3ba9eb5370..d9fc47c9114 100644 --- a/gcc/expr.h +++ b/gcc/expr.h @@ -126,7 +126,8 @@ struct by_pieces_prev fixed_size_mode mode; }; -extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods); +extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods, + unsigned ctz_size = 0); extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods, unsigned int, HOST_WIDE_INT, unsigned HOST_WIDE_INT, @@ -134,7 +135,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods, unsigned HOST_WIDE_INT, bool bail_out_libcall = false, bool *is_move_done = NULL, - bool might_overlap = false); + bool might_overlap = false, + unsigned ctz_size = 0); extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool, by_pieces_constfn, void *); extern bool emit_storent_insn (rtx to, rtx from); diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c new file mode 100644 index 00000000000..2a7d74fbee4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-finline-stringops=memcpy -g0 -fno-lto" } */ + +#include "../memcmp-1.c" +/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the + memcpy tests. */ + +/* { dg-final { scan-assembler-not "memcpy" } } */ diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c new file mode 100644 index 00000000000..5e38bc99ce1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c @@ -0,0 +1,7 @@ +/* { dg-do run } */ +/* { dg-options "-finline-stringops=memmove -g0 -fno-lto" } */ + +#include "../../gcc.c-torture/execute/builtins/memmove.c" + +/* { dg-final { scan-assembler-not "memcpy" } } */ +/* { dg-final { scan-assembler-not "memmove" } } */