From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2140) id 9BFDC3858C3A; Mon, 20 Nov 2023 09:03:28 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 9BFDC3858C3A DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1700471008; bh=q6u7ixhC4M2T52IciajvNTXHYSBHfpfQKUaC5RAtFm8=; h=From:To:Subject:Date:From; b=OzU/MScmPwU2mvrcemJSuq8kUZS7mheY9cxxJ9RTAll48pkVo/LqE95uDdDILf0O9 eEQOwMJPowE8HNKLrKBJez8dzryTwDwsGQGT3Gr3wRMgjnDhcnNHI6TwG5Z/C9314+ D3J5yKpyjfcVx9JaMLyNBbRuy962n5iCOBQrOag8= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Alexandre Oliva To: gcc-cvs@gcc.gnu.org Subject: [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops X-Act-Checkin: gcc X-Git-Author: Alexandre Oliva X-Git-Refname: refs/users/aoliva/heads/testme X-Git-Oldrev: 7c7bbfc3b5965a5f658b78f97183b5b4060934ba X-Git-Newrev: 959574d2b5ac900f9dfdd29d0b0dc6fd9dcbdad2 Message-Id: <20231120090328.9BFDC3858C3A@sourceware.org> Date: Mon, 20 Nov 2023 09:03:28 +0000 (GMT) List-Id: https://gcc.gnu.org/g:959574d2b5ac900f9dfdd29d0b0dc6fd9dcbdad2 commit 959574d2b5ac900f9dfdd29d0b0dc6fd9dcbdad2 Author: Alexandre Oliva Date: Fri Sep 15 02:41:24 2023 -0300 Introduce -finline-stringops try_store_by_multiple_pieces was added not long ago, enabling variable-sized memset to be expanded inline when the worst-case in-range constant length would, using conditional blocks with powers of two to cover all possibilities of length and alignment. This patch introduces -finline-stringops[=fn] to request expansions to start with a loop, so as to still take advantage of known alignment even with long lengths, but without necessarily adding store blocks for every power of two. This makes it possible for the supported stringops (memset, memcpy, memmove, memset) to be expanded, even if storing a single byte per iteration. Surely efficient implementations can run faster, with a pre-loop to increase alignment, but that would likely be excessive for inline expansions. Still, in some cases, such as in freestanding environments, users prefer to inline such stringops, especially those that the compiler may introduce itself, even if the expansion is not as performant as a highly optimized C library implementation could be, to avoid depending on a C runtime library. for gcc/ChangeLog * expr.cc (emit_block_move_hints): Take ctz of len. Obey -finline-stringops. Use oriented or sized loop. (emit_block_move): Take ctz of len, and pass it on. (emit_block_move_via_sized_loop): New. (emit_block_move_via_oriented_loop): New. (emit_block_move_via_loop): Take incr. Move an incr-sized block per iteration. (emit_block_cmp_via_cmpmem): Take ctz of len. Obey -finline-stringops. (emit_block_cmp_via_loop): New. * expr.h (emit_block_move): Add ctz of len defaulting to zero. (emit_block_move_hints): Likewise. (emit_block_cmp_hints): Likewise. * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of len to emit_block_move_hints. (try_store_by_multiple_pieces): Support starting with a loop. (expand_builtin_memcmp): Pass ctz of len to emit_block_cmp_hints. (expand_builtin): Allow inline expansion of memset, memcpy, memmove and memcmp if requested. * common.opt (finline-stringops): New. (ilsop_fn): New enum. * flag-types.h (enum ilsop_fn): New. * doc/invoke.texi (-finline-stringops): Add. for gcc/testsuite/ChangeLog * gcc.dg/torture/inline-mem-cmp-1.c: New. * gcc.dg/torture/inline-mem-cpy-1.c: New. * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New. * gcc.dg/torture/inline-mem-move-1.c: New. * gcc.dg/torture/inline-mem-set-1.c: New. Diff: --- gcc/builtins.cc | 149 +++++++- gcc/common.opt | 34 ++ gcc/doc/invoke.texi | 15 + gcc/expr.cc | 396 ++++++++++++++++++++- gcc/expr.h | 9 +- gcc/flag-types.h | 11 + gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c | 7 + gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c | 8 + .../gcc.dg/torture/inline-mem-cpy-cmp-1.c | 11 + gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c | 8 + gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 84 +++++ 11 files changed, 697 insertions(+), 35 deletions(-) diff --git a/gcc/builtins.cc b/gcc/builtins.cc index 27f16b3cf68..26df959520f 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -3752,7 +3752,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len, expected_align, expected_size, min_size, max_size, probable_max_size, use_mempcpy_call, &is_move_done, - might_overlap); + might_overlap, tree_ctz (len)); /* Bail out when a mempcpy call would be expanded as libcall and when we have a target that provides a fast implementation @@ -4314,6 +4314,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, int tst_bits = (max_bits != min_bits ? max_bits : floor_log2 (max_len ^ min_len)); + /* Save the pre-blksize values. */ + int orig_max_bits = max_bits; + int orig_tst_bits = tst_bits; + /* Check whether it's profitable to start by storing a fixed BLKSIZE bytes, to lower max_bits. In the unlikely case of a constant LEN (implied by identical MAX_LEN and MIN_LEN), we want to issue a @@ -4353,9 +4357,81 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, if (max_bits >= 0) xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2 - (HOST_WIDE_INT_1U << ctz_len)); - if (!can_store_by_pieces (xlenest, builtin_memset_read_str, - &valc, align, true)) - return false; + bool max_loop = false; + bool use_store_by_pieces = true; + /* Skip the test in case of overflow in xlenest. It shouldn't + happen because of the way max_bits and blksize are related, but + it doesn't hurt to test. */ + if (blksize > xlenest + || !can_store_by_pieces (xlenest, builtin_memset_read_str, + &valc, align, true)) + { + if (!(flag_inline_stringops & ILSOP_MEMSET)) + return false; + + for (max_bits = orig_max_bits; + max_bits >= sctz_len; + --max_bits) + { + xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2 + - (HOST_WIDE_INT_1U << ctz_len)); + /* Check that blksize plus the bits to be stored as blocks + sized at powers of two can be stored by pieces. This is + like the test above, but with smaller max_bits. Skip + orig_max_bits (it would be redundant). Also skip in case + of overflow. */ + if (max_bits < orig_max_bits + && xlenest + blksize >= xlenest + && can_store_by_pieces (xlenest + blksize, + builtin_memset_read_str, + &valc, align, true)) + { + max_loop = true; + break; + } + if (blksize + && can_store_by_pieces (xlenest, + builtin_memset_read_str, + &valc, align, true)) + { + max_len += blksize; + min_len += blksize; + tst_bits = orig_tst_bits; + blksize = 0; + max_loop = true; + break; + } + if (max_bits == sctz_len) + { + /* We'll get here if can_store_by_pieces refuses to + store even a single QImode. We'll fall back to + QImode stores then. */ + if (!sctz_len) + { + blksize = 0; + max_loop = true; + use_store_by_pieces = false; + break; + } + --sctz_len; + --ctz_len; + } + } + if (!max_loop) + return false; + /* If the boundaries are such that min and max may run a + different number of trips in the initial loop, the remainder + needs not be between the moduli, so set tst_bits to cover all + bits. Otherwise, if the trip counts are the same, max_len + has the common prefix, and the previously-computed tst_bits + is usable. */ + if (max_len >> max_bits > min_len >> max_bits) + tst_bits = max_bits; + } + /* ??? Do we have to check that all powers of two lengths from + max_bits down to ctz_len pass can_store_by_pieces? As in, could + it possibly be that xlenest passes while smaller power-of-two + sizes don't? */ by_pieces_constfn constfun; void *constfundata; @@ -4397,7 +4473,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, the least significant bit possibly set in the length. */ for (int i = max_bits; i >= sctz_len; i--) { + rtx_code_label *loop_label = NULL; rtx_code_label *label = NULL; + blksize = HOST_WIDE_INT_1U << i; /* If we're past the bits shared between min_ and max_len, expand @@ -4411,25 +4489,57 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, profile_probability::even ()); } /* If we are at a bit that is in the prefix shared by min_ and - max_len, skip this BLKSIZE if the bit is clear. */ - else if ((max_len & blksize) == 0) + max_len, skip the current BLKSIZE if the bit is clear, but do + not skip the loop, even if it doesn't require + prechecking. */ + else if ((max_len & blksize) == 0 + && !(max_loop && i == max_bits)) continue; - /* Issue a store of BLKSIZE bytes. */ - to = store_by_pieces (to, blksize, - constfun, constfundata, - align, true, - i != sctz_len ? RETURN_END : RETURN_BEGIN); + if (max_loop && i == max_bits) + { + loop_label = gen_label_rtx (); + emit_label (loop_label); + /* Since we may run this multiple times, don't assume we + know anything about the offset. */ + clear_mem_offset (to); + } + bool update_needed = i != sctz_len || loop_label; + rtx next_ptr = NULL_RTX; + if (!use_store_by_pieces) + { + gcc_checking_assert (blksize == 1); + if (!val) + val = gen_int_mode (valc, QImode); + to = change_address (to, QImode, 0); + emit_move_insn (to, val); + if (update_needed) + next_ptr = plus_constant (ptr_mode, ptr, blksize); + } + else + { + /* Issue a store of BLKSIZE bytes. */ + to = store_by_pieces (to, blksize, + constfun, constfundata, + align, true, + update_needed ? RETURN_END : RETURN_BEGIN); + next_ptr = XEXP (to, 0); + } /* Adjust REM and PTR, unless this is the last iteration. */ - if (i != sctz_len) + if (update_needed) { - emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX)); + emit_move_insn (ptr, force_operand (next_ptr, NULL_RTX)); to = replace_equiv_address (to, ptr); rtx rem_minus_blksize = plus_constant (ptr_mode, rem, -blksize); emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX)); } + if (loop_label) + emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL, + ptr_mode, 1, loop_label, + profile_probability::likely ()); + if (label) { emit_label (label); @@ -4716,7 +4826,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq) result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx, TREE_TYPE (len), target, result_eq, constfn, - CONST_CAST (char *, rep)); + CONST_CAST (char *, rep), + tree_ctz (len)); if (result) { @@ -7605,7 +7716,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, && fcode != BUILT_IN_EXECVE && fcode != BUILT_IN_CLEAR_CACHE && !ALLOCA_FUNCTION_CODE_P (fcode) - && fcode != BUILT_IN_FREE) + && fcode != BUILT_IN_FREE + && (fcode != BUILT_IN_MEMSET + || !(flag_inline_stringops & ILSOP_MEMSET)) + && (fcode != BUILT_IN_MEMCPY + || !(flag_inline_stringops & ILSOP_MEMCPY)) + && (fcode != BUILT_IN_MEMMOVE + || !(flag_inline_stringops & ILSOP_MEMMOVE)) + && (fcode != BUILT_IN_MEMCMP + || !(flag_inline_stringops & ILSOP_MEMCMP))) return expand_call (exp, target, ignore); /* The built-in function expanders test for target == const0_rtx diff --git a/gcc/common.opt b/gcc/common.opt index 5b4cef21bd6..4914b6e7dbf 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1943,6 +1943,40 @@ finline-atomics Common Var(flag_inline_atomics) Init(1) Optimization Inline __atomic operations when a lock free instruction sequence is available. +finline-stringops +Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented + +fno-inline-stringops +Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented + +finline-stringops= +Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization +-finline-stringops[=memcmp|memcpy|memmove|memset] +Expand supported mem/str operations inline, even if against optimization. + +Enum +Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs) + +; This is not part of any set. +; EnumValue +; Enum(ilsop_fn) String(none) Value(ILSOP_NONE) + +EnumValue +Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1) + +EnumValue +Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2) + +EnumValue +Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3) + +EnumValue +Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4) + +; This is not part of any set either. +; EnumValue +; Enum(ilsop_fn) String(all) Value(ILSOP_ALL) + fcf-protection Common RejectNegative Alias(fcf-protection=,full) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 7110c462d1d..ff3eb373cda 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -560,6 +560,7 @@ Objective-C and Objective-C++ Dialects}. -fgcse -fgcse-after-reload -fgcse-las -fgcse-lm -fgraphite-identity -fgcse-sm -fhoist-adjacent-loads -fif-conversion -fif-conversion2 -findirect-inlining +-finline-stringops[=@var{fn}] -finline-functions -finline-functions-called-once -finline-limit=@var{n} -finline-small-functions -fipa-modref -fipa-cp -fipa-cp-clone -fipa-bit-cp -fipa-vrp -fipa-pta -fipa-profile -fipa-pure-const @@ -12348,6 +12349,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives. Enabled at levels @option{-O2}, @option{-O3}. +@opindex finline-stringops +@item -finline-stringops[=@var{fn}] +Expand memory and string operations (for now, only @code{memset}) +inline, even when the length is variable or big enough as to require +looping. This is most useful along with @option{-ffreestanding} and +@option{-fno-builtin}. + +In some circumstances, it enables the compiler to generate code that +takes advantage of known alignment and length multipliers, but even then +it may be less efficient than optimized runtime implementations, and +grow code size so much that even a less performant but shared +implementation runs faster due to better use of code caches. This +option is disabled by default. + @opindex fno-inline @opindex finline @item -fno-inline diff --git a/gcc/expr.cc b/gcc/expr.cc index 556bcf7ef59..b1db81a47d0 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc @@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned, HOST_WIDE_INT, unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, bool); -static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned); +static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int); +static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned); +static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned); +static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool, + unsigned, unsigned); static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int); static rtx_insn *compress_float_constant (rtx, rtx); static rtx get_subtarget (rtx); @@ -1985,6 +1989,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len, MIN_SIZE is the minimal size of block to move MAX_SIZE is the maximal size of block to move, if it cannot be represented in unsigned HOST_WIDE_INT, than it is mask of all ones. + CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is + known to be a multiple of 1< 1 && !can_move_by_pieces (incr, align)) + incr >>= 1; + + gcc_checking_assert (incr); + + return emit_block_move_via_loop (x, y, size, align, incr); +} + +/* Like emit_block_move_via_sized_loop, but besides choosing INCR so + as to ensure safe moves even in case of overlap, output dynamic + tests to choose between two loops, one moving downwards, another + moving upwards. */ + +static void +emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size, + unsigned int align, + unsigned int ctz_size) +{ + int incr = align / BITS_PER_UNIT; + + if (CONST_INT_P (size)) + ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size))); + + if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr) + incr = HOST_WIDE_INT_1U << ctz_size; + + while (incr > 1 && !int_mode_for_size (incr, 0).exists ()) + incr >>= 1; + + gcc_checking_assert (incr); + + rtx_code_label *upw_label, *end_label; + upw_label = gen_label_rtx (); + end_label = gen_label_rtx (); + + rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX); + rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX); + do_pending_stack_adjust (); + + machine_mode mode = GET_MODE (x_addr); + if (mode != GET_MODE (y_addr)) + { + scalar_int_mode xmode + = smallest_int_mode_for_size (GET_MODE_BITSIZE (mode)); + scalar_int_mode ymode + = smallest_int_mode_for_size (GET_MODE_BITSIZE + (GET_MODE (y_addr))); + if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode)) + mode = ymode; + else + mode = xmode; + +#ifndef POINTERS_EXTEND_UNSIGNED + const int POINTERS_EXTEND_UNSIGNED = 1; +#endif + x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr, + POINTERS_EXTEND_UNSIGNED); + y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr, + POINTERS_EXTEND_UNSIGNED); + } + + /* Test for overlap: if (x >= y || x + size <= y) goto upw_label. */ + emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode, + true, upw_label, + profile_probability::guessed_always () + .apply_scale (5, 10)); + rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true); + tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp); + + emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode, + true, upw_label, + profile_probability::guessed_always () + .apply_scale (8, 10)); + + emit_block_move_via_loop (x, y, size, align, -incr); + + emit_jump (end_label); + emit_label (upw_label); + + emit_block_move_via_loop (x, y, size, align, incr); + + emit_label (end_label); +} + /* A subroutine of emit_block_move. Copy the data via an explicit - loop. This is used only when libcalls are forbidden. */ -/* ??? It'd be nice to copy in hunks larger than QImode. */ + loop. This is used only when libcalls are forbidden, or when + inlining is required. INCR is the block size to be copied in each + loop iteration. If it is negative, the absolute value is used, and + the block is copied backwards. INCR must be a power of two, an + exact divisor for SIZE and ALIGN, and imply a mode that can be + safely copied per iteration assuming no overlap. */ static void emit_block_move_via_loop (rtx x, rtx y, rtx size, - unsigned int align ATTRIBUTE_UNUSED) + unsigned int align, int incr) { rtx_code_label *cmp_label, *top_label; rtx iter, x_addr, y_addr, tmp; @@ -2307,7 +2429,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size, cmp_label = gen_label_rtx (); iter = gen_reg_rtx (iter_mode); - emit_move_insn (iter, const0_rtx); + bool downwards = incr < 0; + rtx iter_init; + rtx_code iter_cond; + rtx iter_limit; + rtx iter_incr; + machine_mode move_mode; + if (downwards) + { + incr = -incr; + iter_init = size; + iter_cond = GEU; + iter_limit = const0_rtx; + iter_incr = GEN_INT (incr); + } + else + { + iter_init = const0_rtx; + iter_cond = LTU; + iter_limit = size; + iter_incr = GEN_INT (incr); + } + emit_move_insn (iter, iter_init); + + scalar_int_mode int_move_mode + = smallest_int_mode_for_size (incr * BITS_PER_UNIT); + if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT) + { + move_mode = BLKmode; + gcc_checking_assert (can_move_by_pieces (incr, align)); + } + else + move_mode = int_move_mode; x_addr = force_operand (XEXP (x, 0), NULL_RTX); y_addr = force_operand (XEXP (y, 0), NULL_RTX); @@ -2323,19 +2476,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size, tmp = convert_modes (y_addr_mode, iter_mode, iter, true); y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp); - x = change_address (x, QImode, x_addr); - y = change_address (y, QImode, y_addr); + x = change_address (x, move_mode, x_addr); + y = change_address (y, move_mode, y_addr); + + if (move_mode == BLKmode) + { + bool done; + emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL, + align, incr, incr, incr, incr, + false, &done, false); + gcc_checking_assert (done); + } + else + emit_move_insn (x, y); - emit_move_insn (x, y); + if (downwards) + emit_label (cmp_label); - tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter, + tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter, true, OPTAB_LIB_WIDEN); if (tmp != iter) emit_move_insn (iter, tmp); - emit_label (cmp_label); + if (!downwards) + emit_label (cmp_label); - emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, + emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode, true, top_label, profile_probability::guessed_always () .apply_scale (9, 10)); @@ -2435,7 +2601,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target, Both X and Y must be MEM rtx's. LEN is an rtx that says how long they are. LEN_TYPE is the type of the expression that was used to - calculate it. + calculate it, and CTZ_LEN is the known trailing-zeros count of LEN, + so LEN must be a multiple of 1< 1 + && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES)) + incr >>= 1; + + rtx_code_label *cmp_label, *top_label, *ne_label, *res_label; + rtx iter, x_addr, y_addr, tmp; + machine_mode x_addr_mode = get_address_mode (x); + machine_mode y_addr_mode = get_address_mode (y); + machine_mode iter_mode; + + iter_mode = GET_MODE (len); + if (iter_mode == VOIDmode) + iter_mode = word_mode; + + rtx iter_init = const0_rtx; + rtx_code iter_cond = LTU; + rtx_code entry_cond = GEU; + rtx iter_limit = len; + rtx iter_incr = GEN_INT (incr); + machine_mode cmp_mode; + + /* We can drop the loop back edge if we know there's exactly one + iteration. */ + top_label = (!rtx_equal_p (len, iter_incr) + ? gen_label_rtx () + : NULL); + /* We need not test before entering the loop if len is known + nonzero. ??? This could be even stricter, testing whether a + nonconstant LEN could possibly be zero. */ + cmp_label = (!CONSTANT_P (len) || rtx_equal_p (len, iter_init) + ? gen_label_rtx () + : NULL); + ne_label = gen_label_rtx (); + res_label = gen_label_rtx (); + + iter = gen_reg_rtx (iter_mode); + emit_move_insn (iter, iter_init); + + scalar_int_mode int_cmp_mode + = smallest_int_mode_for_size (incr * BITS_PER_UNIT); + if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT + || !can_compare_p (NE, int_cmp_mode, ccp_jump)) + { + cmp_mode = BLKmode; + gcc_checking_assert (incr != 1); + } + else + cmp_mode = int_cmp_mode; + + /* Save the base addresses. */ + x_addr = force_operand (XEXP (x, 0), NULL_RTX); + y_addr = force_operand (XEXP (y, 0), NULL_RTX); + do_pending_stack_adjust (); + + if (cmp_label) + { + if (top_label) + emit_jump (cmp_label); + else + emit_cmp_and_jump_insns (iter, iter_limit, entry_cond, + NULL_RTX, iter_mode, + true, cmp_label, + profile_probability::guessed_always () + .apply_scale (1, 10)); + } + if (top_label) + emit_label (top_label); + + /* Offset the base addresses by ITER. */ + tmp = convert_modes (x_addr_mode, iter_mode, iter, true); + x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp); + + if (x_addr_mode != y_addr_mode) + tmp = convert_modes (y_addr_mode, iter_mode, iter, true); + y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp); + + x = change_address (x, cmp_mode, x_addr); + y = change_address (y, cmp_mode, y_addr); + + /* Compare one block. */ + rtx part_res; + if (cmp_mode == BLKmode) + part_res = compare_by_pieces (x, y, incr, target, align, 0, 0); + else + part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX, + true, OPTAB_LIB_WIDEN); + + /* Stop if we found a difference. */ + emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX, + GET_MODE (part_res), true, ne_label, + profile_probability::guessed_always () + .apply_scale (1, 10)); + + /* Increment ITER. */ + tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter, + true, OPTAB_LIB_WIDEN); + if (tmp != iter) + emit_move_insn (iter, tmp); + + if (cmp_label) + emit_label (cmp_label); + /* Loop until we reach the limit. */ + + if (top_label) + emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode, + true, top_label, + profile_probability::guessed_always () + .apply_scale (9, 10)); + + /* We got to the end without differences, so the result is zero. */ + if (target == NULL_RTX + || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER) + target = gen_reg_rtx (TYPE_MODE (integer_type_node)); + + emit_move_insn (target, const0_rtx); + emit_jump (res_label); + + emit_label (ne_label); + + /* Return nonzero, or pinpoint the difference to return the expected + result for non-equality tests. */ + if (equality_only) + emit_move_insn (target, const1_rtx); + else + { + if (incr > UNITS_PER_WORD) + /* ??? Re-compare the block found to be different one word at a + time. */ + part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type, + target, equality_only, + BITS_PER_WORD, 0); + else if (incr > 1) + /* ??? Re-compare the block found to be different one byte at a + time. We could do better using part_res, and being careful + about endianness. */ + part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type, + target, equality_only, + BITS_PER_UNIT, 0); + else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)), + GET_MODE_BITSIZE (cmp_mode))) + part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target, + true, OPTAB_LIB_WIDEN); + else + { + /* In the odd chance target is QImode, we can't count on + widening subtract to capture the result of the unsigned + compares. */ + rtx_code_label *ltu_label; + ltu_label = gen_label_rtx (); + emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX, + cmp_mode, true, ltu_label, + profile_probability::guessed_always () + .apply_scale (5, 10)); + + emit_move_insn (target, const1_rtx); + emit_jump (res_label); + + emit_label (ltu_label); + emit_move_insn (target, constm1_rtx); + part_res = target; + } + + if (target != part_res) + convert_move (target, part_res, false); + } + + emit_label (res_label); + + return target; +} + /* Copy all or part of a value X into registers starting at REGNO. The number of registers to be filled is NREGS. */ diff --git a/gcc/expr.h b/gcc/expr.h index 2a172867fdb..431616d55be 100644 --- a/gcc/expr.h +++ b/gcc/expr.h @@ -126,7 +126,8 @@ struct by_pieces_prev fixed_size_mode mode; }; -extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods); +extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods, + unsigned ctz_size = 0); extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods, unsigned int, HOST_WIDE_INT, unsigned HOST_WIDE_INT, @@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods, unsigned HOST_WIDE_INT, bool bail_out_libcall = false, bool *is_move_done = NULL, - bool might_overlap = false); + bool might_overlap = false, + unsigned ctz_size = 0); extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool, - by_pieces_constfn, void *); + by_pieces_constfn, void *, + unsigned ctz_len = 0); extern bool emit_storent_insn (rtx to, rtx from); /* Copy all or part of a value X into registers starting at REGNO. diff --git a/gcc/flag-types.h b/gcc/flag-types.h index c1852cd810c..650ae08b004 100644 --- a/gcc/flag-types.h +++ b/gcc/flag-types.h @@ -447,6 +447,17 @@ enum gfc_convert }; +/* Inline String Operations functions. */ +enum ilsop_fn +{ + ILSOP_NONE = 0, + ILSOP_MEMSET = 1 << 0, + ILSOP_MEMCPY = 1 << 1, + ILSOP_MEMMOVE = 1 << 2, + ILSOP_MEMCMP = 1 << 3, + ILSOP_ALL = -1 +}; + /* Control-Flow Protection values. */ enum cf_protection_level { diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c new file mode 100644 index 00000000000..a368f074112 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c @@ -0,0 +1,7 @@ +/* { dg-do run } */ +/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */ + +#include "../memcmp-1.c" + +/* Check that no memcmp calls remain, but allow for lib_memcmp calls. */ +/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */ diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c new file mode 100644 index 00000000000..c98e903c1f1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c @@ -0,0 +1,8 @@ +/* { dg-do run } */ +/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */ + +#include "../memcmp-1.c" +/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the + memcpy tests. */ + +/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */ diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c new file mode 100644 index 00000000000..2cd2057a9b5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c @@ -0,0 +1,11 @@ +/* { dg-do run } */ +/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */ +/* { dg-require-effective-target ptr32plus } */ +/* { dg-timeout-factor 2 } */ + +#include "../memcmp-1.c" +/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the + memcpy tests. */ + +/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */ +/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */ diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c new file mode 100644 index 00000000000..c0eca5bce6b --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c @@ -0,0 +1,8 @@ +/* { dg-do run } */ +/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */ + +#include "../../gcc.c-torture/execute/builtins/memmove.c" + +#include "../../gcc.c-torture/execute/builtins/lib/main.c" + +/* { dg-final { scan-assembler-not {\mmemmove\M} } } */ diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c new file mode 100644 index 00000000000..bdcf9bf5b61 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c @@ -0,0 +1,84 @@ +/* { dg-do compile } */ +/* { dg-options "-finline-stringops -fno-lto" } */ + +void *zero (unsigned long long (*p)[32], int n) +{ + return __builtin_memset (p, 0, n * sizeof (*p)); +} + +void *ones (char (*p)[128], int n) +{ + return __builtin_memset (p, -1, n * sizeof (*p)); +} + +void *opt2 (int *p, int i) +{ + return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p)); +} + +void *opt8 (int *p, int i) +{ + return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p)); +} + +void *opt32 (int *p, int i) +{ + return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p)); +} + +void *opt128 (int *p, int i) +{ + return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p)); +} + +void *opt512 (int *p, int i) +{ + return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p)); +} + +void *opt_primes (int *p, int i) +{ + return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p)); +} + +void *opt_primes_blk (int *p, int i) +{ + return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p)); +} + +void *huge (long (*p)[16384]) +{ + return __builtin_memset (p, 0, sizeof (*p)); +} + +void *hugep1 (long (*p)[16384+1]) +{ + return __builtin_memset (p, 0, sizeof (*p)); +} + +void *hugep4 (long (*p)[16384+4]) +{ + return __builtin_memset (p, 0, sizeof (*p)); +} + +void *hugep16 (long (*p)[16384+16]) +{ + return __builtin_memset (p, 0, sizeof (*p)); +} + +void *hugep64 (long (*p)[16384+64]) +{ + return __builtin_memset (p, 0, sizeof (*p)); +} + +void *hugep256 (long (*p)[16384+256]) +{ + return __builtin_memset (p, 0, sizeof (*p)); +} + +void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1]) +{ + return __builtin_memset (p, 0, sizeof (*p)); +} + +/* { dg-final { scan-assembler-not {\mmemset\M} } } */