From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2140) id 94B9F385B500; Fri, 23 Dec 2022 00:57:48 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 94B9F385B500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1671757068; bh=lEtkTzFs0Nj/pDrNql52SKVQwMduyPR80cv29K2+WyU=; h=From:To:Subject:Date:From; b=Ot6rUhz/frxvnpXe/Iwj1WqpYwBYxxrZTWAXQYBELoMqGanAke5bcee6FdBFOUSzG fJb3IpzyVFzkUQ7Iy/YsDXdi4ySfxC9hBIHQ+U9wIAMHC1mqnGyUhFgj8YCnxfS4Yy 4uJrodGLpkB2RBgnGzDwZu7mIju5/m0E2QfC5omY= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Alexandre Oliva To: gcc-cvs@gcc.gnu.org Subject: [gcc(refs/users/aoliva/heads/testme)] inline memset loops X-Act-Checkin: gcc X-Git-Author: Alexandre Oliva X-Git-Refname: refs/users/aoliva/heads/testme X-Git-Oldrev: 40b8ac12dfce49a956bcd61f3d53f779dec2c630 X-Git-Newrev: 7d172bec5011526917596810a9fe0a140bcbe9a8 Message-Id: <20221223005748.94B9F385B500@sourceware.org> Date: Fri, 23 Dec 2022 00:57:48 +0000 (GMT) List-Id: https://gcc.gnu.org/g:7d172bec5011526917596810a9fe0a140bcbe9a8 commit 7d172bec5011526917596810a9fe0a140bcbe9a8 Author: Alexandre Oliva Date: Thu Dec 22 21:28:44 2022 -0300 inline memset loops Diff: --- gcc/builtins.cc | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- gcc/common.opt | 5 +++++ gcc/doc/invoke.texi | 13 ++++++++++++- 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/gcc/builtins.cc b/gcc/builtins.cc index 02c4fefa86f..388bae58ce4 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -4361,9 +4361,37 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, if (max_bits >= 0) xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2 - (HOST_WIDE_INT_1U << ctz_len)); + bool max_loop = false; if (!can_store_by_pieces (xlenest, builtin_memset_read_str, &valc, align, true)) - return false; + { + if (!flag_inline_memset_loops) + return false; + while 
(--max_bits >= sctz_len) + { + xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2 + - (HOST_WIDE_INT_1U << ctz_len)); + if (can_store_by_pieces (xlenest + blksize, + builtin_memset_read_str, + &valc, align, true)) + { + max_loop = true; + break; + } + if (!blksize) + continue; + if (can_store_by_pieces (xlenest, + builtin_memset_read_str, + &valc, align, true)) + { + blksize = 0; + max_loop = true; + break; + } + } + if (!max_loop) + return false; + } by_pieces_constfn constfun; void *constfundata; @@ -4405,6 +4433,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, the least significant bit possibly set in the length. */ for (int i = max_bits; i >= sctz_len; i--) { + rtx_code_label *loop_label = NULL; rtx_code_label *label = NULL; blksize = HOST_WIDE_INT_1U << i; @@ -4423,14 +4452,24 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, else if ((max_len & blksize) == 0) continue; + if (max_loop && i == max_bits) + { + loop_label = gen_label_rtx (); + emit_label (loop_label); + /* Since we may run this multiple times, don't assume we + know anything about the offset. */ + clear_mem_offset (to); + } + /* Issue a store of BLKSIZE bytes. */ + bool update_needed = i != sctz_len || loop_label; to = store_by_pieces (to, blksize, constfun, constfundata, align, true, - i != sctz_len ? RETURN_END : RETURN_BEGIN); + update_needed ? RETURN_END : RETURN_BEGIN); /* Adjust REM and PTR, unless this is the last iteration. 
*/ - if (i != sctz_len) + if (update_needed) { emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX)); to = replace_equiv_address (to, ptr); @@ -4438,6 +4477,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX)); } + if (loop_label) + emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL, + ptr_mode, 1, loop_label, + profile_probability::likely ()); + if (label) { emit_label (label); diff --git a/gcc/common.opt b/gcc/common.opt index 562d73d7f55..b21f4225782 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1874,6 +1874,11 @@ finline-atomics Common Var(flag_inline_atomics) Init(1) Optimization Inline __atomic operations when a lock free instruction sequence is available. +; FIXME: Disable by default! +finline-memset-loops +Common Var(flag_inline_memset_loops) Init(1) Optimization +Inline memset even if it requires loops. + fcf-protection Common RejectNegative Alias(fcf-protection=,full) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index da9ad1068fb..19f436ad463 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}. -fgcse-sm -fhoist-adjacent-loads -fif-conversion @gol -fif-conversion2 -findirect-inlining @gol -finline-functions -finline-functions-called-once -finline-limit=@var{n} @gol --finline-small-functions -fipa-modref -fipa-cp -fipa-cp-clone @gol +-finline-memset-loops -finline-small-functions @gol +-fipa-modref -fipa-cp -fipa-cp-clone @gol -fipa-bit-cp -fipa-vrp -fipa-pta -fipa-profile -fipa-pure-const @gol -fipa-reference -fipa-reference-addressable @gol -fipa-stack-alignment -fipa-icf -fira-algorithm=@var{algorithm} @gol @@ -11960,6 +11961,16 @@ in its own right. Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os}, but not @option{-Og}. 
+@item -finline-memset-loops +@opindex finline-memset-loops +Expand @code{memset} calls inline, even when the length is variable or +big enough to require looping. This may enable the compiler to take +advantage of known alignment and length multipliers, but it will often +generate code that is less efficient than performant implementations of +@code{memset}, and grow code size so much that even a less performant +@code{memset} may run faster due to better use of the code cache. This +option is disabled by default. + @item -fearly-inlining @opindex fearly-inlining Inline functions marked by @code{always_inline} and functions whose body seems