From: Alexandre Oliva
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-memset-loops
X-Act-Checkin: gcc
X-Git-Author: Alexandre Oliva
X-Git-Refname: refs/users/aoliva/heads/testme
X-Git-Oldrev: d8d14963f7eb24e07a4c066909db734e31ec8736
X-Git-Newrev: ff392972a77b438939a5194e6238be16e48bf045
Message-Id: <20230114143129.27FBA384406A@sourceware.org>
Date: Sat, 14 Jan 2023 14:31:29 +0000 (GMT)

https://gcc.gnu.org/g:ff392972a77b438939a5194e6238be16e48bf045

commit ff392972a77b438939a5194e6238be16e48bf045
Author: Alexandre Oliva
Date:   Thu Dec 22 21:28:44 2022 -0300

    Introduce -finline-memset-loops

    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would be, using conditional blocks with
    powers of two to cover all possibilities of length and alignment.

    This patch extends the memset expansion to start with a loop, so as
    to still take advantage of known alignment even with long lengths,
    but without necessarily adding store blocks for every power of two.

    This makes it possible for any memset call to be expanded, even if
    storing a single byte per iteration.  Surely efficient
    implementations of memset can do better, with a pre-loop to
    increase alignment, but that would likely be excessive for inline
    expansions of memset.

    Still, in some cases, users prefer to inline memset, even if it's
    not as performant, or when it's known to be performant in ways the
    compiler can't tell, to avoid depending on a C runtime library.

    With this flag, enabled globally or per function, inline expansion
    of memset can be selectively requested, while the infrastructure
    for that may enable us to introduce per-target tuning to enable
    such looping whenever advantageous, even if not explicitly
    requested.

    for gcc/ChangeLog

            * builtins.cc (try_store_by_multiple_pieces): Support starting
            with a loop.
            * common.opt (finline-memset-loops): New.
            * doc/invoke.texi (-finline-memset-loops): Add.

    for gcc/testsuite/ChangeLog

            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                 | 75 ++++++++++++++++++++--
 gcc/common.opt                                  |  4 ++
 gcc/doc/invoke.texi                             | 13 +++-
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 84 +++++++++++++++++++++++++
 4 files changed, 171 insertions(+), 5 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index af45829875e..911773f170f 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4322,6 +4322,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
                   : floor_log2 (max_len ^ min_len));

+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4361,9 +4365,55 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
                 - (HOST_WIDE_INT_1U << ctz_len));
+  bool max_loop = false;
   if (!can_store_by_pieces (xlenest, builtin_memset_read_str, &valc,
                             align, true))
-    return false;
+    {
+      if (!flag_inline_memset_loops)
+        return false;
+
+      for (max_bits = orig_max_bits;
+           max_bits >= sctz_len;
+           --max_bits)
+        {
+          xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+                     - (HOST_WIDE_INT_1U << ctz_len));
+          if (can_store_by_pieces (xlenest + blksize,
+                                   builtin_memset_read_str,
+                                   &valc, align, true))
+            {
+              max_loop = true;
+              break;
+            }
+          if (blksize
+              && can_store_by_pieces (xlenest,
+                                      builtin_memset_read_str,
+                                      &valc, align, true))
+            {
+              max_len += blksize;
+              min_len += blksize;
+              tst_bits = orig_tst_bits;
+              blksize = 0;
+              max_loop = true;
+              break;
+            }
+          if (max_bits == sctz_len)
+            {
+              --sctz_len;
+              --ctz_len;
+            }
+        }
+      if (!max_loop)
+        return false;
+      /* If the boundaries are such that min and max may run a
+         different number of trips in the initial loop, the remainder
+         needs not be between the moduli, so set tst_bits to cover all
+         bits.  Otherwise, if the trip counts are the same, max_len
+         has the common prefix, and the previously-computed tst_bits
+         is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+        tst_bits = max_bits;
+    }

   by_pieces_constfn constfun;
   void *constfundata;
@@ -4405,7 +4455,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;

       /* If we're past the bits shared between min_ and max_len, expand
@@ -4419,18 +4471,28 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
                                        profile_probability::even ());
         }
       /* If we are at a bit that is in the prefix shared by min_ and
-         max_len, skip this BLKSIZE if the bit is clear.  */
+         max_len, skip the current BLKSIZE if the bit is clear.  */
       else if ((max_len & blksize) == 0)
         continue;

+      if (max_loop && i == max_bits)
+        {
+          loop_label = gen_label_rtx ();
+          emit_label (loop_label);
+          /* Since we may run this multiple times, don't assume we
+             know anything about the offset.  */
+          clear_mem_offset (to);
+        }
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
                             constfun, constfundata,
                             align, true,
-                            i != sctz_len ? RETURN_END : RETURN_BEGIN);
+                            update_needed ? RETURN_END : RETURN_BEGIN);

       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
         {
           emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
           to = replace_equiv_address (to, ptr);
@@ -4438,6 +4500,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
           emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
         }

+      if (loop_label)
+        emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+                                 ptr_mode, 1, loop_label,
+                                 profile_probability::likely ());
+
       if (label)
         {
           emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index d0371aec8db..5d28f054241 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,10 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.

+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(0) Optimization
+Inline memset even if it requires loops.
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index dec0cdb9d35..063ae03b30e 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse-sm -fhoist-adjacent-loads -fif-conversion @gol
 -fif-conversion2 -findirect-inlining @gol
 -finline-functions -finline-functions-called-once -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp -fipa-cp-clone @gol
 -fipa-bit-cp -fipa-vrp -fipa-pta -fipa-profile -fipa-pure-const @gol
 -fipa-reference -fipa-reference-addressable @gol
 -fipa-stack-alignment -fipa-icf -fira-algorithm=@var{algorithm} @gol
@@ -11961,6 +11962,16 @@ in its own right.
 Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
 but not @option{-Og}.

+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough as to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache.  This
+option is disabled by default.
+
 @item -fearly-inlining
 @opindex fearly-inlining
 Inline functions marked by @code{always_inline} and functions whose body seems
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..4de51df006e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-memset-loops -gno-record-gcc-switches -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */
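
For illustration only, and not part of the patch: below is a rough,
hypothetical C-level sketch of the code shape the new expansion aims for
when -finline-memset-loops applies to a memset whose length is variable
but known to be a multiple of 16 bytes.  A loop issues the largest block
repeatedly (the new loop_label / emit_cmp_and_jump_insns path), and
conditional power-of-two blocks then cover the remainder (the behavior
try_store_by_multiple_pieces already had for constant-bounded lengths).
The function and helper names here are made up; the real expansion emits
RTL through store_by_pieces rather than C source.

/* Illustration only, not from the patch.  Assumes the destination is
   suitably aligned and LEN is a multiple of 16.  */

#include <stddef.h>
#include <stdint.h>

static inline void
store8 (unsigned char *p, uint64_t v)
{
  /* A fixed-size 8-byte copy; compilers turn this into one store.  */
  __builtin_memcpy (p, &v, sizeof v);
}

void
inline_memset_shape (void *dst, int c, size_t len)
{
  unsigned char *p = dst;
  uint64_t v = 0x0101010101010101ULL * (unsigned char) c;
  size_t rem = len;

  /* Loop on the largest block size instead of emitting one conditional
     block per power of two all the way up to the maximum length; this
     corresponds to the new loop_label path.  */
  while (rem >= 32)
    {
      store8 (p, v);
      store8 (p + 8, v);
      store8 (p + 16, v);
      store8 (p + 24, v);
      p += 32;
      rem -= 32;
    }

  /* Conditional block for the remaining power of two between the loop's
     block size and the known 16-byte granule.  */
  if (rem & 16)
    {
      store8 (p, v);
      store8 (p + 8, v);
      p += 16;
    }
  /* With a 16-byte granule, nothing smaller remains to be stored.  */
}

Compiling the new test above with -finline-memset-loops and inspecting
the generated assembly is the most direct way to see the actual shape
the expansion takes on a given target.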