public inbox for gcc-cvs@sourceware.org
* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-06-02  4:39 Alexandre Oliva
From: Alexandre Oliva @ 2023-06-02  4:39 UTC
  To: gcc-cvs

https://gcc.gnu.org/g:5f509457cb75ae274eef5a84f5dfa09c612f9a65

commit 5f509457cb75ae274eef5a84f5dfa09c612f9a65
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Thu Jun 1 22:38:29 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline whenever the worst-case
    in-range constant length would be, using conditional blocks with
    powers of two to cover all combinations of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
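
As a rough illustration (not part of the patch): a freestanding
translation unit like the sketch below, built with something along the
lines of -O2 -ffreestanding -finline-stringops, gets its
variable-length memset expanded inline instead of calling into a C
runtime; the function name and flag combination are an assumed example.

    /* sketch.c -- hypothetical example; assumes -finline-stringops
       expands the variable-length memset below inline.  */
    void *clear_buffer (void *p, unsigned long n)
    {
      /* n is variable and possibly large; with -finline-stringops the
         compiler emits a loop rather than a library call.  */
      return __builtin_memset (p, 0, n);
    }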
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_hints): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 ++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 374 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   6 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 645 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 8400adaf5b4..1beaa4eae97 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3769,7 +3769,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4335,6 +4335,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4374,9 +4378,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 needs not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4418,7 +4483,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4432,18 +4499,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4451,6 +4531,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4737,7 +4822,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7380,7 +7466,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..fcf945019ec 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1882,6 +1882,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9130104af22..923c8005f57 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -550,6 +550,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12081,6 +12082,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or big enough as to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
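
The documentation hunk above mentions taking advantage of known
alignment and length multipliers; a minimal sketch of the kind of call
that benefits (an assumed example in the spirit of the new
inline-mem-set-1.c tests, not code from the patch):

    /* The length is n * sizeof (*rows), a multiple of sizeof (long),
       and rows is naturally aligned, so the inline expansion can store
       word-sized (or larger) chunks instead of single bytes.  */
    void clear_rows (long (*rows)[8], int n)
    {
      __builtin_memset (rows, 0, n * sizeof (*rows));
    }
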
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 56b51876f80..efa409643a3 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
 
-  emit_move_insn (x, y);
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  if (downwards)
+    emit_label (cmp_label);
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,181 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constats.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
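
As a rough C-level sketch (not the RTL the patch emits), the sized-loop
strategy above boils down to: pick INCR as a power of two no larger
than the known alignment and dividing the known length multiple, then
move INCR bytes per iteration; the oriented variant additionally emits
a runtime overlap test to choose between a forward and a backward walk.
The helper below is an assumed illustration with INCR fixed to
sizeof (long):

    /* Assumed illustration only: copies n bytes a word at a time,
       relying on the same preconditions the patch establishes before
       choosing a word-sized loop (n is a multiple of sizeof (long)
       and both pointers are at least long-aligned).  */
    static void
    copy_by_words (void *dstv, const void *srcv, unsigned long n)
    {
      long *dst = dstv;
      const long *src = srcv;
      for (unsigned long i = 0; i < n / sizeof (long); i++)
        dst[i] = src[i];
    }
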
diff --git a/gcc/expr.h b/gcc/expr.h
index 0c059ed9023..63c75c9688c 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 78dbdbe90dd..a9d9eb166b6 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -432,6 +432,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
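
For reference, a small sketch of the bitmask semantics of this enum;
the gating test mirrors the checks added to builtins.cc above, while
the standalone variable and helper function are assumptions made only
for illustration:

    /* Assumed illustration: each supported stringop is gated on its
       own bit of the option variable.  */
    enum ilsop_fn
    {
      ILSOP_NONE = 0,
      ILSOP_MEMSET = 1 << 0,
      ILSOP_MEMCPY = 1 << 1,
      ILSOP_MEMMOVE = 1 << 2,
      ILSOP_MEMCMP = 1 << 3,
      ILSOP_ALL = -1
    };

    static int flag_inline_stringops = ILSOP_MEMCPY | ILSOP_MEMSET;

    static int
    inline_memset_allowed (void)
    {
      /* -finline-stringops sets all bits; -finline-stringops=memset
         sets only ILSOP_MEMSET.  */
      return (flag_inline_stringops & ILSOP_MEMSET) != 0;
    }
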
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..d6f1d90b26d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..f1d77b50a4c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */


* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-11-29  4:35 Alexandre Oliva
From: Alexandre Oliva @ 2023-11-29  4:35 UTC
  To: gcc-cvs

https://gcc.gnu.org/g:7431755100be6acbb4ee4dc2134d8ecd85e54ddb

commit 7431755100be6acbb4ee4dc2134d8ecd85e54ddb
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Wed Nov 29 00:54:23 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline whenever the worst-case
    in-range constant length would be, using conditional blocks with
    powers of two to cover all combinations of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_hints): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 149 +++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 396 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   8 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 697 insertions(+), 35 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 6af2a0b07aa..4fc58a0bda9 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3751,7 +3751,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4313,6 +4313,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4352,9 +4356,81 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  bool use_store_by_pieces = true;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      /* We'll get here if can_store_by_pieces refuses to
+		 store even a single QImode.  We'll fall back to
+		 QImode stores then.  */
+	      if (!sctz_len)
+		{
+		  blksize = 0;
+		  max_loop = true;
+		  use_store_by_pieces = false;
+		  break;
+		}
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 needs not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4396,7 +4472,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4410,25 +4488,57 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
-      /* Issue a store of BLKSIZE bytes.  */
-      to = store_by_pieces (to, blksize,
-			    constfun, constfundata,
-			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
 
+      bool update_needed = i != sctz_len || loop_label;
+      rtx next_ptr = NULL_RTX;
+      if (!use_store_by_pieces)
+	{
+	  gcc_checking_assert (blksize == 1);
+	  if (!val)
+	    val = gen_int_mode (valc, QImode);
+	  to = change_address (to, QImode, 0);
+	  emit_move_insn (to, val);
+	  if (update_needed)
+	    next_ptr = plus_constant (ptr_mode, ptr, blksize);
+	}
+      else
+	{
+	  /* Issue a store of BLKSIZE bytes.  */
+	  to = store_by_pieces (to, blksize,
+				constfun, constfundata,
+				align, true,
+				update_needed ? RETURN_END : RETURN_BEGIN);
+	  next_ptr = XEXP (to, 0);
+	}
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
-	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
+	  emit_move_insn (ptr, force_operand (next_ptr, NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
 	  rtx rem_minus_blksize = plus_constant (ptr_mode, rem, -blksize);
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4715,7 +4825,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7358,7 +7469,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index 736a4653578..161a035d736 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1955,6 +1955,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2e6bac37f3c..9f8a4ef3d5f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -560,6 +560,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12373,6 +12374,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or big enough as to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index c432170403b..fea719028a1 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1982,6 +1986,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1993,7 +1999,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2079,6 +2085,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2100,10 +2114,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2112,7 +2128,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2120,7 +2137,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2282,13 +2300,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2304,7 +2426,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2320,19 +2473,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
+
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  emit_move_insn (x, y);
+  if (downwards)
+    emit_label (cmp_label);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2432,7 +2598,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2448,7 +2615,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2470,8 +2637,203 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx_code entry_cond = GEU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  /* We can drop the loop back edge if we know there's exactly one
+     iteration.  */
+  top_label = (!rtx_equal_p (len, iter_incr)
+	       ? gen_label_rtx ()
+	       : NULL);
+  /* We need not test before entering the loop if len is known
+     nonzero.  ??? This could be even stricter, testing whether a
+     nonconstant LEN could possibly be zero.  */
+  cmp_label = (!CONSTANT_P (len) || rtx_equal_p (len, iter_init)
+	       ? gen_label_rtx ()
+	       : NULL);
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+
+  iter = gen_reg_rtx (iter_mode);
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  if (cmp_label)
+    {
+      if (top_label)
+	emit_jump (cmp_label);
+      else
+	emit_cmp_and_jump_insns (iter, iter_limit, entry_cond,
+				 NULL_RTX, iter_mode,
+				 true, cmp_label,
+				 profile_probability::guessed_always ()
+				 .apply_scale (1, 10));
+    }
+  if (top_label)
+    emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  if (cmp_label)
+    emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+
+  if (top_label)
+    emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			     true, top_label,
+			     profile_probability::guessed_always ()
+			     .apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
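
The sized loop above copies INCR bytes per iteration, with INCR derived
from the alignment, capped at the length's known power-of-two factor, and
further reduced until the target can actually move a block of that size.
A plain-C sketch of that choice (illustrative only, not part of the patch;
the function name is made up):

/* Mirror of the INCR selection in emit_block_move_via_sized_loop,
   written as plain C for illustration.  ALIGN_BITS is the alignment in
   bits (at least 8), CTZ_SIZE the known trailing-zeros count of the
   length.  */
unsigned int
choose_incr_sketch (unsigned int align_bits, unsigned int ctz_size)
{
  unsigned int incr = align_bits / 8;   /* alignment, in bytes */

  /* Never move more per iteration than the length is known to be a
     multiple of.  */
  if ((1u << ctz_size) < incr)
    incr = 1u << ctz_size;

  /* The real code also halves INCR while the target cannot move such a
     block (!can_move_by_pieces), bottoming out at 1.  */
  return incr;
}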
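
The new emit_block_cmp_via_loop similarly turns memcmp into a loop over
INCR-sized blocks, with a second pass that pinpoints the differing unit
when an ordered (non-equality-only) result is needed.  Roughly, at the C
level (illustrative only; 4 stands in for INCR and the function below is
not part of the patch):

/* Rough C-level shape of the inline memcmp expansion, assuming
   4-byte-aligned operands and a length known to be a multiple of 4.  */
int
inline_memcmp_sketch (const unsigned char *a, const unsigned char *b,
                      unsigned long n)
{
  unsigned long i;

  for (i = 0; i < n; i += 4)
    {
      unsigned int wa, wb;
      __builtin_memcpy (&wa, a + i, 4);   /* one block load per side */
      __builtin_memcpy (&wb, b + i, 4);
      if (wa != wb)
        goto differ;
    }
  return 0;   /* reached the limit: all blocks equal */

 differ:
  /* Re-scan the differing block one unit at a time to produce the
     ordered result; the equality-only case just returns nonzero.  */
  for (unsigned long j = i; j < i + 4; j++)
    if (a[j] != b[j])
      return a[j] < b[j] ? -1 : 1;
  return 0;   /* not reached */
}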
diff --git a/gcc/expr.h b/gcc/expr.h
index 2a172867fdb..431616d55be 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index c1852cd810c..650ae08b004 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -447,6 +447,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
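
The new enum is a bitmask, so each stringop can be requested separately
and the bits accumulate.  A self-contained sketch of the intended
semantics (the enum values are copied from the patch; the function and
the `requested' parameter are made up, standing in for
flag_inline_stringops):

/* Sketch of the ilsop_fn bitmask usage.  */
enum ilsop_fn_sketch
{
  ILSOP_SK_NONE = 0,
  ILSOP_SK_MEMSET = 1 << 0,
  ILSOP_SK_MEMCPY = 1 << 1,
  ILSOP_SK_MEMMOVE = 1 << 2,
  ILSOP_SK_MEMCMP = 1 << 3,
  ILSOP_SK_ALL = -1
};

int
memcpy_inlining_requested (int requested)
{
  /* -finline-stringops=memcpy sets the MEMCPY bit; naming another
     function in a further use of the option ORs in its bit; plain
     -finline-stringops selects ALL, which has every bit set.  */
  return (requested & ILSOP_SK_MEMCPY) != 0;
}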
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..a368f074112
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..c0eca5bce6b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-11-23 11:45 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-11-23 11:45 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:fc5140e027881b844338d60a86ef0086de4493d9

commit fc5140e027881b844338d60a86ef0086de4493d9
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Fri Sep 15 02:41:24 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
     memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 149 +++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 396 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   8 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 697 insertions(+), 35 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 5ece0d23eb9..af6f9526219 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3750,7 +3750,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4312,6 +4312,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4351,9 +4355,81 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  bool use_store_by_pieces = true;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      /* We'll get here if can_store_by_pieces refuses to
+		 store even a single QImode.  We'll fall back to
+		 QImode stores then.  */
+	      if (!sctz_len)
+		{
+		  blksize = 0;
+		  max_loop = true;
+		  use_store_by_pieces = false;
+		  break;
+		}
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4395,7 +4471,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4409,25 +4487,57 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
-      /* Issue a store of BLKSIZE bytes.  */
-      to = store_by_pieces (to, blksize,
-			    constfun, constfundata,
-			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
 
+      bool update_needed = i != sctz_len || loop_label;
+      rtx next_ptr = NULL_RTX;
+      if (!use_store_by_pieces)
+	{
+	  gcc_checking_assert (blksize == 1);
+	  if (!val)
+	    val = gen_int_mode (valc, QImode);
+	  to = change_address (to, QImode, 0);
+	  emit_move_insn (to, val);
+	  if (update_needed)
+	    next_ptr = plus_constant (ptr_mode, ptr, blksize);
+	}
+      else
+	{
+	  /* Issue a store of BLKSIZE bytes.  */
+	  to = store_by_pieces (to, blksize,
+				constfun, constfundata,
+				align, true,
+				update_needed ? RETURN_END : RETURN_BEGIN);
+	  next_ptr = XEXP (to, 0);
+	}
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
-	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
+	  emit_move_insn (ptr, force_operand (next_ptr, NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
 	  rtx rem_minus_blksize = plus_constant (ptr_mode, rem, -blksize);
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4714,7 +4824,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7357,7 +7468,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
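
With -finline-stringops=memset in effect, the code above no longer bails
out when the worst-case length is too large for can_store_by_pieces:
max_bits is lowered until the block sizes still pass, and the largest
block is wrapped in a loop, leaving the smaller power-of-two blocks to
mop up the remainder.  A rough C-level picture of the resulting shape
(illustrative only; 16 stands for the looped block size and 4 for the
length's known multiple):

/* Rough shape of the expansion of memset (p, 0, n) once the largest
   block is looped over; assumes n is a known multiple of 4.  Not part
   of the patch.  */
void *
inline_memset_sketch (void *p, unsigned long n)
{
  unsigned char *ptr = p;
  unsigned long rem = n;

  /* Loop storing the largest block while at least one such block
     remains; each fixed-size store is itself expanded by pieces.  */
  while (rem >= 16)
    {
      __builtin_memset (ptr, 0, 16);
      ptr += 16;
      rem -= 16;
    }
  /* Conditional stores for the remaining powers of two, down to the
     length's known power-of-two factor.  */
  if (rem & 8)
    {
      __builtin_memset (ptr, 0, 8);
      ptr += 8;
    }
  if (rem & 4)
    __builtin_memset (ptr, 0, 4);

  return p;
}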
diff --git a/gcc/common.opt b/gcc/common.opt
index d21db5d4a20..6a514df71dd 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1943,6 +1943,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index bc9f4f70914..a71fd524dfd 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -560,6 +560,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12346,6 +12347,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or big enough as to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
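
As a usage sketch for the documentation above (hypothetical code, not
from the testsuite): compiled with, say, -ffreestanding
-finline-stringops, the variable-length clear below is expanded inline
instead of becoming a call to a memset that a freestanding environment
would otherwise have to provide.

/* Hypothetical freestanding example; the names are made up.  */
struct rxbuf
{
  unsigned char data[512];
};

void
clear_rxbuf (struct rxbuf *b, unsigned int used)
{
  /* Variable length: without -finline-stringops this would normally
     become a memset call.  */
  __builtin_memset (b->data, 0, used);
}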
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 556bcf7ef59..b1db81a47d0 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1985,6 +1989,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1996,7 +2002,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2082,6 +2088,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2103,10 +2117,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2115,7 +2131,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2123,7 +2140,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2285,13 +2303,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2307,7 +2429,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2323,19 +2476,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
+
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  emit_move_insn (x, y);
+  if (downwards)
+    emit_label (cmp_label);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2435,7 +2601,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2451,7 +2618,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2473,8 +2640,203 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx_code entry_cond = GEU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  /* We can drop the loop back edge if we know there's exactly one
+     iteration.  */
+  top_label = (!rtx_equal_p (len, iter_incr)
+	       ? gen_label_rtx ()
+	       : NULL);
+  /* We need not test before entering the loop if len is known
+     nonzero.  ??? This could be even stricter, testing whether a
+     nonconstant LEN could possibly be zero.  */
+  cmp_label = (!CONSTANT_P (len) || rtx_equal_p (len, iter_init)
+	       ? gen_label_rtx ()
+	       : NULL);
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+
+  iter = gen_reg_rtx (iter_mode);
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  if (cmp_label)
+    {
+      if (top_label)
+	emit_jump (cmp_label);
+      else
+	emit_cmp_and_jump_insns (iter, iter_limit, entry_cond,
+				 NULL_RTX, iter_mode,
+				 true, cmp_label,
+				 profile_probability::guessed_always ()
+				 .apply_scale (1, 10));
+    }
+  if (top_label)
+    emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  if (cmp_label)
+    emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+
+  if (top_label)
+    emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			     true, top_label,
+			     profile_probability::guessed_always ()
+			     .apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
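
For the memmove case (might_overlap), emit_block_move_via_oriented_loop
above emits a run-time test and two loops, so that whichever copy
direction is safe for the actual operands gets used.  The usual
arrangement, sketched in plain C (illustrative only, not a transcription
of the emitted RTL; 4 stands in for INCR):

/* Plain-C sketch of a direction-dispatched block move; assumes n is a
   multiple of 4.  Not part of the patch.  */
void
inline_memmove_sketch (unsigned char *dst, const unsigned char *src,
                       unsigned long n)
{
  if (dst <= src || src + n <= dst)
    {
      /* Forward copy reads each source block before it can be
         overwritten.  */
      for (unsigned long i = 0; i < n; i += 4)
        __builtin_memcpy (dst + i, src + i, 4);
    }
  else
    {
      /* src < dst < src + n: copy backwards instead.  */
      for (unsigned long i = n; i > 0; i -= 4)
        __builtin_memcpy (dst + i - 4, src + i - 4, 4);
    }
}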
diff --git a/gcc/expr.h b/gcc/expr.h
index 2a172867fdb..431616d55be 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index c1852cd810c..650ae08b004 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -447,6 +447,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..a368f074112
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..c0eca5bce6b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-11-20  9:03 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-11-20  9:03 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:959574d2b5ac900f9dfdd29d0b0dc6fd9dcbdad2

commit 959574d2b5ac900f9dfdd29d0b0dc6fd9dcbdad2
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Fri Sep 15 02:41:24 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
     memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 149 +++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 396 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   8 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 697 insertions(+), 35 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 27f16b3cf68..26df959520f 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3752,7 +3752,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4314,6 +4314,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4353,9 +4357,81 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  bool use_store_by_pieces = true;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      /* We'll get here if can_store_by_pieces refuses to
+		 store even a single QImode.  We'll fall back to
+		 QImode stores then.  */
+	      if (!sctz_len)
+		{
+		  blksize = 0;
+		  max_loop = true;
+		  use_store_by_pieces = false;
+		  break;
+		}
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4397,7 +4473,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4411,25 +4489,57 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
-      /* Issue a store of BLKSIZE bytes.  */
-      to = store_by_pieces (to, blksize,
-			    constfun, constfundata,
-			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
 
+      bool update_needed = i != sctz_len || loop_label;
+      rtx next_ptr = NULL_RTX;
+      if (!use_store_by_pieces)
+	{
+	  gcc_checking_assert (blksize == 1);
+	  if (!val)
+	    val = gen_int_mode (valc, QImode);
+	  to = change_address (to, QImode, 0);
+	  emit_move_insn (to, val);
+	  if (update_needed)
+	    next_ptr = plus_constant (ptr_mode, ptr, blksize);
+	}
+      else
+	{
+	  /* Issue a store of BLKSIZE bytes.  */
+	  to = store_by_pieces (to, blksize,
+				constfun, constfundata,
+				align, true,
+				update_needed ? RETURN_END : RETURN_BEGIN);
+	  next_ptr = XEXP (to, 0);
+	}
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
-	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
+	  emit_move_insn (ptr, force_operand (next_ptr, NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
 	  rtx rem_minus_blksize = plus_constant (ptr_mode, rem, -blksize);
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4716,7 +4826,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7605,7 +7716,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
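
For readers who do not want to trace the RTL emission above, here is a rough
C-level picture of the code shape the patched try_store_by_multiple_pieces
aims for once -finline-stringops=memset forces the loop path.  This is an
editorial sketch, not the emitted RTL: the function name, the 32-byte top
block and the use of __builtin_memset as a stand-in for fixed-size stores are
made up for illustration; the real expansion works on the REM/PTR pseudos and
derives its block sizes from the length range, ctz_len and alignment.

/* Sketch: variable-length memset whose length is known to be a multiple
   of 4, with 32 bytes as the largest block tried.  */
static void
memset_expansion_sketch (unsigned char *ptr, unsigned long rem)
{
  /* Initial loop at the largest block size (the new loop_label path).  */
  while (rem >= 32)
    {
      __builtin_memset (ptr, 0, 32);	/* stands in for one 32-byte store */
      ptr += 32;
      rem -= 32;
    }
  /* One conditional block per smaller power of two, down to the known
     multiple of the length (here 4 bytes).  */
  if (rem & 16) { __builtin_memset (ptr, 0, 16); ptr += 16; }
  if (rem & 8)  { __builtin_memset (ptr, 0, 8);  ptr += 8;  }
  if (rem & 4)  __builtin_memset (ptr, 0, 4);
}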
diff --git a/gcc/common.opt b/gcc/common.opt
index 5b4cef21bd6..4914b6e7dbf 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1943,6 +1943,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 7110c462d1d..ff3eb373cda 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -560,6 +560,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12348,6 +12349,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or big enough to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
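
To make the documentation above concrete, a hypothetical example of the
user-visible effect (this file and command line are editorial illustrations,
not part of the patch; the behaviour they describe is what the new torture
tests check with scan-assembler-not):

/* zero.c -- with something like
     gcc -O2 -ffreestanding -finline-stringops=memset -S zero.c
   the generated assembly should contain no call to memset; without the
   option, a variable-length clear like this normally becomes a call.  */

void *
zero_buffer (unsigned long long (*p)[32], int n)
{
  return __builtin_memset (p, 0, n * sizeof (*p));
}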
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 556bcf7ef59..b1db81a47d0 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1985,6 +1989,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1996,7 +2002,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2082,6 +2088,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2103,10 +2117,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2115,7 +2131,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2123,7 +2140,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2285,13 +2303,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2307,7 +2429,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2323,19 +2476,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
+
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  emit_move_insn (x, y);
+  if (downwards)
+    emit_label (cmp_label);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2435,7 +2601,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2451,7 +2618,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2473,8 +2640,203 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constats.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx_code entry_cond = GEU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  /* We can drop the loop back edge if we know there's exactly one
+     iteration.  */
+  top_label = (!rtx_equal_p (len, iter_incr)
+	       ? gen_label_rtx ()
+	       : NULL);
+  /* We need not test before entering the loop if len is known
+     nonzero.  ??? This could be even stricter, testing whether a
+     nonconstant LEN could possibly be zero.  */
+  cmp_label = (!CONSTANT_P (len) || rtx_equal_p (len, iter_init)
+	       ? gen_label_rtx ()
+	       : NULL);
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+
+  iter = gen_reg_rtx (iter_mode);
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  if (cmp_label)
+    {
+      if (top_label)
+	emit_jump (cmp_label);
+      else
+	emit_cmp_and_jump_insns (iter, iter_limit, entry_cond,
+				 NULL_RTX, iter_mode,
+				 true, cmp_label,
+				 profile_probability::guessed_always ()
+				 .apply_scale (1, 10));
+    }
+  if (top_label)
+    emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  if (cmp_label)
+    emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+
+  if (top_label)
+    emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			     true, top_label,
+			     profile_probability::guessed_always ()
+			     .apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
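
A C-level model (editorial sketch, not the actual RTL) of the copy loop that
emit_block_move_via_sized_loop arranges above: INCR bytes are moved per
iteration, where INCR is a power of two chosen from the alignment and from
1<<ctz_size so that it exactly divides the length; emit_block_move_via_oriented_loop
then wraps two such loops, one per copy direction, behind the run-time
overlap test shown in the hunk.

static void
block_copy_sketch (unsigned char *dst, const unsigned char *src,
		   unsigned long size, unsigned long incr)
{
  /* SIZE is assumed to be an exact multiple of INCR, as the expansion
     guarantees by construction.  */
  for (unsigned long iter = 0; iter < size; iter += incr)
    __builtin_memcpy (dst + iter, src + iter, incr);	/* one INCR-byte move */
}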
diff --git a/gcc/expr.h b/gcc/expr.h
index 2a172867fdb..431616d55be 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
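
The ctz_len parameter that expr.h now exposes on emit_block_cmp_hints feeds
the new emit_block_cmp_via_loop in expr.cc above.  A rough C model of the
expansion it stands for (editorial sketch; __builtin_memcmp is only a
stand-in for one block compare, and the real code returns a subtraction or
+/-1 rather than exactly these values):

static int
block_cmp_sketch (const unsigned char *x, const unsigned char *y,
		  unsigned long len, unsigned long incr, int equality_only)
{
  for (unsigned long iter = 0; iter < len; iter += incr)
    if (__builtin_memcmp (x + iter, y + iter, incr) != 0)
      {
	if (equality_only)
	  return 1;
	/* Pinpoint the first differing byte to get an ordered result.  */
	for (unsigned long i = 0; i < incr; i++)
	  if (x[iter + i] != y[iter + i])
	    return x[iter + i] < y[iter + i] ? -1 : 1;
      }
  return 0;
}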
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index c1852cd810c..650ae08b004 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -447,6 +447,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
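
The ILSOP_* values above form a bitmask in flag_inline_stringops, and the
expanders in builtins.cc and expr.cc gate on individual bits.  A minimal
illustration of that test (exposition only; the real code uses the enum
directly, e.g. flag_inline_stringops & ILSOP_MEMSET):

/* -finline-stringops         sets all bits (ILSOP_ALL == -1),
   -finline-stringops=memset  sets only ILSOP_MEMSET, and the expanders
   then test single bits, e.g.:  */
static int
memset_inlining_requested (int flags)
{
  return (flags & (1 << 0) /* ILSOP_MEMSET */) != 0;
}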
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..a368f074112
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..c0eca5bce6b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-09-23  5:23 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-09-23  5:23 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:fc43d0c01c773b43c2287fe6edd35f89545c28c8

commit fc43d0c01c773b43c2287fe6edd35f89545c28c8
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Fri Sep 15 02:41:24 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 149 +++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 396 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   8 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 697 insertions(+), 35 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 3b453b3ec8c..fc3e3ae61cb 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3771,7 +3771,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4337,6 +4337,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4376,9 +4380,81 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  bool use_store_by_pieces = true;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      /* We'll get here if can_store_by_pieces refuses to
+		 store even a single QImode.  We'll fall back to
+		 QImode stores then.  */
+	      if (!sctz_len)
+		{
+		  blksize = 0;
+		  max_loop = true;
+		  use_store_by_pieces = false;
+		  break;
+		}
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4420,7 +4496,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4434,25 +4512,57 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
-      /* Issue a store of BLKSIZE bytes.  */
-      to = store_by_pieces (to, blksize,
-			    constfun, constfundata,
-			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
 
+      bool update_needed = i != sctz_len || loop_label;
+      rtx next_ptr = NULL_RTX;
+      if (!use_store_by_pieces)
+	{
+	  gcc_checking_assert (blksize == 1);
+	  if (!val)
+	    val = gen_int_mode (valc, QImode);
+	  to = change_address (to, QImode, 0);
+	  emit_move_insn (to, val);
+	  if (update_needed)
+	    next_ptr = plus_constant (ptr_mode, ptr, blksize);
+	}
+      else
+	{
+	  /* Issue a store of BLKSIZE bytes.  */
+	  to = store_by_pieces (to, blksize,
+				constfun, constfundata,
+				align, true,
+				update_needed ? RETURN_END : RETURN_BEGIN);
+	  next_ptr = XEXP (to, 0);
+	}
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
-	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
+	  emit_move_insn (ptr, force_operand (next_ptr, NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
 	  rtx rem_minus_blksize = plus_constant (ptr_mode, rem, -blksize);
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4739,7 +4849,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7382,7 +7493,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
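
One detail worth spelling out from the builtins.cc hunks: the tree_ctz (len)
argument threaded through to the expanders is the number of trailing zero
bits known for the length, so even a fully variable length can permit
wider-than-byte stores in the inline loop.  A small hypothetical example:

/* n * sizeof (*buf) is a multiple of sizeof (unsigned long long) -- 8 bytes
   on usual targets -- so tree_ctz sees at least 3 trailing zero bits and the
   expansion may move 8-byte blocks per iteration instead of single bytes.  */
void
clear_words (unsigned long long *buf, unsigned long n)
{
  __builtin_memset (buf, 0, n * sizeof (*buf));
}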
diff --git a/gcc/common.opt b/gcc/common.opt
index f137a1f81ac..1e3f8efbc97 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1908,6 +1908,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 03d93e6b185..b0649712792 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -556,6 +556,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12228,6 +12229,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or big enough to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index d5b6494b4fc..c6842996b98 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
+
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  emit_move_insn (x, y);
+  if (downwards)
+    emit_label (cmp_label);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,203 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx_code entry_cond = GEU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  /* We can drop the loop back edge if we know there's exactly one
+     iteration.  */
+  top_label = (!rtx_equal_p (len, iter_incr)
+	       ? gen_label_rtx ()
+	       : NULL);
+  /* We need not test before entering the loop if len is known
+     nonzero.  ??? This could be even stricter, testing whether a
+     nonconstant LEN could possibly be zero.  */
+  cmp_label = (!CONSTANT_P (len) || rtx_equal_p (len, iter_init)
+	       ? gen_label_rtx ()
+	       : NULL);
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+
+  iter = gen_reg_rtx (iter_mode);
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  if (cmp_label)
+    {
+      if (top_label)
+	emit_jump (cmp_label);
+      else
+	emit_cmp_and_jump_insns (iter, iter_limit, entry_cond,
+				 NULL_RTX, iter_mode,
+				 true, cmp_label,
+				 profile_probability::guessed_always ()
+				 .apply_scale (1, 10));
+    }
+  if (top_label)
+    emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  if (cmp_label)
+    emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+
+  if (top_label)
+    emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			     true, top_label,
+			     profile_probability::guessed_always ()
+			     .apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
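
As a plain-C illustration of what the emit_block_cmp_via_loop expansion
above amounts to (a sketch only -- the function emits RTL, and the name
inline_memcmp_sketch, the fixed 8-byte block and the multiple-of-8
length are assumptions standing in for the size derived from ALIGN and
CTZ_LEN):

#include <stdint.h>
#include <string.h>

/* Compare one aligned block per iteration; on the first mismatch,
   re-compare that block a byte at a time to produce the usual memcmp
   tri-state result.  LEN is assumed to be a multiple of 8.  */
static int
inline_memcmp_sketch (const void *xp, const void *yp, size_t len)
{
  const unsigned char *x = xp;
  const unsigned char *y = yp;

  for (size_t i = 0; i < len; i += 8)
    {
      uint64_t xw, yw;
      memcpy (&xw, x + i, 8);   /* One word-sized load per operand...  */
      memcpy (&yw, y + i, 8);
      if (xw != yw)             /* ...and a single compare-and-branch.  */
        for (size_t j = 0; j < 8; j++)
          if (x[i + j] != y[i + j])
            return x[i + j] < y[i + j] ? -1 : 1;
    }
  return 0;                     /* No difference found.  */
}
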
diff --git a/gcc/expr.h b/gcc/expr.h
index 11bff531862..988c783e450 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 7466c1106f2..7a20cfd5de3 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -437,6 +437,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..a368f074112
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..c0eca5bce6b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */


* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-09-22  5:46 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-09-22  5:46 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:57fd5fb847bafc5c566d5ee47921a05fb30348a5

commit 57fd5fb847bafc5c566d5ee47921a05fb30348a5
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Fri Sep 15 02:41:24 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 120 ++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 396 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 674 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 3b453b3ec8c..dab4055ff62 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3771,7 +3771,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4337,6 +4337,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4376,9 +4380,76 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      /* We'll get here if can_store_by_pieces refuses to
+		 store even a single QImode.  ??? If inline expansion
+		 was requested with -finline-stringops, should we
+		 force it?  */
+	      if (!sctz_len)
+		return false;
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4420,7 +4491,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4434,18 +4507,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4453,6 +4539,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4739,7 +4830,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7382,7 +7474,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
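
In rough C terms, the memset strategy enabled above (a leading loop
that stores the largest block, followed by the existing conditional
power-of-two stores) behaves like the sketch below.  This is only an
illustration: inline_memset_sketch is a made-up name, the 32-byte loop
block and the multiple-of-4 length stand in for values derived from
max_bits and ctz_len, and each memset call stands in for a
store-by-pieces sequence.

#include <string.h>

static void
inline_memset_sketch (void *dst, int c, size_t len)
{
  unsigned char *p = dst;
  size_t rem = len;    /* len is assumed to be a multiple of 4.  */

  /* Leading loop: store the largest block while enough bytes remain.  */
  while (rem >= 32)
    {
      memset (p, c, 32);
      p += 32;
      rem -= 32;
    }

  /* Conditional stores for each smaller power of two, down to the
     known length multiple, cover every possible remainder.  */
  if (rem & 16) { memset (p, c, 16); p += 16; }
  if (rem & 8)  { memset (p, c, 8);  p += 8; }
  if (rem & 4)  { memset (p, c, 4);  p += 4; }
  /* rem & 2 and rem & 1 are zero, since len is a multiple of 4.  */
}
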
diff --git a/gcc/common.opt b/gcc/common.opt
index f137a1f81ac..1e3f8efbc97 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1908,6 +1908,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 03d93e6b185..b0649712792 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -556,6 +556,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12228,6 +12229,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or big enough as to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
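
As a usage sketch (the file name, function and exact command line are
only illustrative): compiling something like the function below with
-O2 -finline-stringops=memset expands the variable-length clear inline
instead of emitting a memset libcall, on targets where the expansion
applies.

/* clear.c -- e.g. gcc -O2 -finline-stringops=memset -S clear.c  */
#include <stddef.h>

void
clear_buffer (unsigned long *buf, size_t n)
{
  __builtin_memset (buf, 0, n * sizeof *buf);
}
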
diff --git a/gcc/expr.cc b/gcc/expr.cc
index d5b6494b4fc..c6842996b98 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
+
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  emit_move_insn (x, y);
+  if (downwards)
+    emit_label (cmp_label);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,203 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx_code entry_cond = GEU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  /* We can drop the loop back edge if we know there's exactly one
+     iteration.  */
+  top_label = (!rtx_equal_p (len, iter_incr)
+	       ? gen_label_rtx ()
+	       : NULL);
+  /* We need not test before entering the loop if len is known
+     nonzero.  ??? This could be even stricter, testing whether a
+     nonconstant LEN could possibly be zero.  */
+  cmp_label = (!CONSTANT_P (len) || rtx_equal_p (len, iter_init)
+	       ? gen_label_rtx ()
+	       : NULL);
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+
+  iter = gen_reg_rtx (iter_mode);
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  if (cmp_label)
+    {
+      if (top_label)
+	emit_jump (cmp_label);
+      else
+	emit_cmp_and_jump_insns (iter, iter_limit, entry_cond,
+				 NULL_RTX, iter_mode,
+				 true, cmp_label,
+				 profile_probability::guessed_always ()
+				 .apply_scale (1, 10));
+    }
+  if (top_label)
+    emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  if (cmp_label)
+    emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+
+  if (top_label)
+    emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			     true, top_label,
+			     profile_probability::guessed_always ()
+			     .apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
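
The dynamic-direction expansion used for possibly-overlapping copies
(emit_block_move_via_oriented_loop above) amounts to choosing a copy
direction at run time from an overlap test, roughly as in the C sketch
below.  The name inline_memmove_sketch, the 8-byte step and the
multiple-of-8 length are assumptions, and the emitted comparisons and
loops differ in detail.

#include <stdint.h>
#include <string.h>

static void
inline_memmove_sketch (void *dstp, const void *srcp, size_t n)
{
  unsigned char *dst = dstp;
  const unsigned char *src = srcp;

  if ((uintptr_t) dst <= (uintptr_t) src
      || (uintptr_t) dst >= (uintptr_t) src + n)
    /* Destination below the source, or no overlap: copy upwards.  */
    for (size_t i = 0; i < n; i += 8)
      {
        uint64_t tmp;
        memcpy (&tmp, src + i, 8);   /* Load the whole block...  */
        memcpy (dst + i, &tmp, 8);   /* ...then store it.  */
      }
  else
    /* Destination overlaps the tail of the source: copy downwards.  */
    for (size_t i = n; i > 0; i -= 8)
      {
        uint64_t tmp;
        memcpy (&tmp, src + i - 8, 8);
        memcpy (dst + i - 8, &tmp, 8);
      }
}
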
diff --git a/gcc/expr.h b/gcc/expr.h
index 11bff531862..988c783e450 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 7466c1106f2..7a20cfd5de3 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -437,6 +437,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..a368f074112
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..f1d77b50a4c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */


* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-09-21  5:07 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-09-21  5:07 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:04f0a5016858364a97b7494d128abd5273daaf91

commit 04f0a5016858364a97b7494d128abd5273daaf91
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Fri Sep 15 02:41:24 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 +++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 396 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 668 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 3b453b3ec8c..23646da09f8 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3771,7 +3771,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4337,6 +4337,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4376,9 +4380,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4420,7 +4485,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4434,18 +4501,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4453,6 +4533,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4739,7 +4824,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7382,7 +7468,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index f137a1f81ac..1e3f8efbc97 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1908,6 +1908,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 03d93e6b185..b0649712792 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -556,6 +556,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12228,6 +12229,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or big enough as to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index d5b6494b4fc..c6842996b98 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
+
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  emit_move_insn (x, y);
+  if (downwards)
+    emit_label (cmp_label);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,203 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constats.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx_code entry_cond = GEU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  /* We can drop the loop back edge if we know there's exactly one
+     iteration.  */
+  top_label = (!rtx_equal_p (len, iter_incr)
+	       ? gen_label_rtx ()
+	       : NULL);
+  /* We need not test before entering the loop if len is known
+     nonzero.  ??? This could be even stricter, testing whether a
+     nonconstant LEN could possibly be zero.  */
+  cmp_label = (!CONSTANT_P (len) || rtx_equal_p (len, iter_init)
+	       ? gen_label_rtx ()
+	       : NULL);
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+
+  iter = gen_reg_rtx (iter_mode);
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  if (cmp_label)
+    {
+      if (top_label)
+	emit_jump (cmp_label);
+      else
+	emit_cmp_and_jump_insns (iter, iter_limit, entry_cond,
+				 NULL_RTX, iter_mode,
+				 true, cmp_label,
+				 profile_probability::guessed_always ()
+				 .apply_scale (1, 10));
+    }
+  if (top_label)
+    emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  if (cmp_label)
+    emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+
+  if (top_label)
+    emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			     true, top_label,
+			     profile_probability::guessed_always ()
+			     .apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index 11bff531862..988c783e450 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 7466c1106f2..7a20cfd5de3 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -437,6 +437,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..a368f074112
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..f1d77b50a4c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */
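
In C terms (an illustrative sketch only, not code from the patch), the
forward case of the new emit_block_move_via_loop copies one INCR-sized
block per iteration, with the counter running from 0 up to SIZE in steps
of INCR.  Assuming INCR settles on 8 bytes, that corresponds roughly to:

    /* Illustrative model; the patch emits RTL directly and also provides a
       backwards-iterating variant and a BLKmode fallback when INCR has no
       integer mode.  INCR = 8 is an assumption for this sketch.  */
    #include <stddef.h>
    #include <string.h>

    static void copy_in_blocks (unsigned char *dst, const unsigned char *src,
                                size_t size)
    {
      /* SIZE is assumed to be a multiple of INCR (8 here), which is what
         the ctz/alignment bookkeeping in the patch guarantees.  */
      for (size_t i = 0; i < size; i += 8)
        memcpy (dst + i, src + i, 8);  /* one INCR-sized block per iteration */
    }

The memmove path (emit_block_move_via_oriented_loop) additionally emits a
runtime overlap test and chooses between an upwards-moving and a
downwards-moving loop.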

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-09-21  4:53 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-09-21  4:53 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:5b34a11db11ba0596ebcbf4478949fecf6c4a234

commit 5b34a11db11ba0596ebcbf4478949fecf6c4a234
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Fri Sep 15 02:41:24 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 +++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 395 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 667 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 3b453b3ec8c..23646da09f8 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3771,7 +3771,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4337,6 +4337,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4376,9 +4380,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4420,7 +4485,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4434,18 +4501,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4453,6 +4533,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4739,7 +4824,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7382,7 +7468,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index f137a1f81ac..1e3f8efbc97 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1908,6 +1908,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 03d93e6b185..b0649712792 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -556,6 +556,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12228,6 +12229,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or big enough as to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index d5b6494b4fc..5a51eb9029c 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
+
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  emit_move_insn (x, y);
+  if (downwards)
+    emit_label (cmp_label);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,202 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both LEN
+   and ALIGN, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  /* We can drop the loop back edge if we know there's exactly one
+     iteration.  */
+  top_label = (!rtx_equal_p (len, iter_incr)
+	       ? gen_label_rtx ()
+	       : NULL);
+  /* We need not test before entering the loop if len is known
+     nonzero.  ??? This could be even stricter, testing whether a
+     nonconstant LEN could possibly be zero.  */
+  cmp_label = (!CONSTANT_P (len) || rtx_equal_p (len, iter_init)
+	       ? gen_label_rtx ()
+	       : NULL);
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+
+  iter = gen_reg_rtx (iter_mode);
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  if (cmp_label)
+    {
+      if (top_label)
+	emit_jump (cmp_label);
+      else
+	emit_cmp_and_jump_insns (iter, iter_limit, entry_cond,
+				 NULL_RTX, iter_mode,
+				 true, cmp_label,
+				 profile_probability::guessed_always ()
+				 .apply_scale (1, 10));
+    }
+  if (top_label)
+    emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  if (cmp_label)
+    emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+
+  if (top_label)
+    emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			     true, top_label,
+			     profile_probability::guessed_always ()
+			     .apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index 11bff531862..988c783e450 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 7466c1106f2..7a20cfd5de3 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -437,6 +437,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..a368f074112
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..f1d77b50a4c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-09-21  3:19 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-09-21  3:19 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:9844b091fbff133f4a35d0e5700ab5fa010c95e8

commit 9844b091fbff133f4a35d0e5700ab5fa010c95e8
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Fri Sep 15 02:41:24 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would be, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
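
    For illustration (the function and variable names below are made
    up), a unit like this one, compiled with -ffreestanding
    -finline-stringops=memset, has its variable-length memset expanded
    inline, using a loop when the length calls for one, instead of
    being emitted as a call into a C library:

      /* Example only: zero N words, N known only at run time.  */
      void clear_words (unsigned long *p, unsigned long n)
      {
        __builtin_memset (p, 0, n * sizeof (*p));
      }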
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 +++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 385 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 657 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 3b453b3ec8c..23646da09f8 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3771,7 +3771,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4337,6 +4337,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4376,9 +4380,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4420,7 +4485,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4434,18 +4501,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4453,6 +4533,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4739,7 +4824,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7382,7 +7468,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index f137a1f81ac..1e3f8efbc97 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1908,6 +1908,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 03d93e6b185..b0649712792 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -556,6 +556,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12228,6 +12229,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (@code{memcmp}, @code{memcpy},
+@code{memmove} and @code{memset}) inline, even when the length is
+variable or big enough to require looping.  This is most useful
+along with @option{-ffreestanding} and @option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index d5b6494b4fc..d2643706dbe 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
+
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  emit_move_insn (x, y);
+  if (downwards)
+    emit_label (cmp_label);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,192 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both LEN
+   and ALIGN, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  /* We can skip the loop if we know there's a single iteration.  */
+  top_label = (!rtx_equal_p (len, iter_incr)
+	       ? gen_label_rtx ()
+	       : NULL);
+  /* We need not test before entering the loop if len is known
+     nonzero.  ??? This could be even stricter, testing whether a
+     nonconstant LEN could possibly be zero.  */
+  cmp_label = (!CONSTANT_P (len) || !rtx_equal_p (len, iter_init)
+	       ? gen_label_rtx ()
+	       : NULL);
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+
+  iter = gen_reg_rtx (iter_mode);
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  if (cmp_label)
+    emit_jump (cmp_label);
+  if (top_label)
+    emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  if (cmp_label)
+    emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+
+  if (top_label)
+    emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			     true, top_label,
+			     profile_probability::guessed_always ()
+			     .apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index 11bff531862..988c783e450 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 7466c1106f2..7a20cfd5de3 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -437,6 +437,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..a368f074112
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..f1d77b50a4c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-09-15  5:42 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-09-15  5:42 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:463fe491cc4bb8552021b9e9d2851082f0e30afb

commit 463fe491cc4bb8552021b9e9d2851082f0e30afb
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Fri Sep 15 02:41:24 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would be, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
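
    For illustration (the function name below is made up), building
    this unit with -finline-stringops=memcmp expands the comparison
    inline, via a compare loop if needed, so no memcmp call into a C
    library remains in the generated code:

      /* Example only: equality-only comparison of two N-byte buffers.  */
      int same_bytes (const void *a, const void *b, unsigned long n)
      {
        return __builtin_memcmp (a, b, n) == 0;
      }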
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 ++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 374 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 646 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 3b453b3ec8c..23646da09f8 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3771,7 +3771,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4337,6 +4337,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4376,9 +4380,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4420,7 +4485,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4434,18 +4501,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4453,6 +4533,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4739,7 +4824,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7382,7 +7468,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
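
To make the new control flow easier to follow, here is a rough C-level
sketch of the expansion shape try_store_by_multiple_pieces aims for when
it falls back to a leading loop.  This is an illustration only, not part
of the patch: the function name, the parameter types and the fixed
32-byte block are made up, and the real expansion is emitted as RTL,
picks the block size from the known alignment and trailing-zero count of
the length, and stops peeling at that granularity instead of going all
the way down to single bytes.

static void
inline_memset_shape (char *p, int c, unsigned long len)
{
  unsigned long rem = len;

  /* Leading loop: store the largest power-of-two block repeatedly,
     instead of emitting one conditional store block for every larger
     power of two up to the worst-case length.  */
  while (rem >= 32)
    {
      __builtin_memset (p, c, 32);  /* fixed size, expanded by pieces */
      p += 32;
      rem -= 32;
    }

  /* Then the pre-existing scheme: one conditional store block per
     remaining power of two.  */
  if (rem & 16) { __builtin_memset (p, c, 16); p += 16; }
  if (rem & 8)  { __builtin_memset (p, c, 8);  p += 8; }
  if (rem & 4)  { __builtin_memset (p, c, 4);  p += 4; }
  if (rem & 2)  { __builtin_memset (p, c, 2);  p += 2; }
  if (rem & 1)  __builtin_memset (p, c, 1);
}
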
diff --git a/gcc/common.opt b/gcc/common.opt
index f137a1f81ac..1e3f8efbc97 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1908,6 +1908,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 03d93e6b185..b0649712792 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -556,6 +556,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12228,6 +12229,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (@code{memcmp}, @code{memcpy},
+@code{memmove}, and @code{memset}) inline, even when the length is
+variable or large enough to require looping.  This is most useful along
+with @option{-ffreestanding} and @option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
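
As a usage illustration (not part of the patch), a minimal test in the
style of the new gcc.dg/torture tests below could look like this; the
file and function names are hypothetical, and the exact code generated
depends on the target:

/* { dg-do compile } */
/* { dg-options "-finline-stringops=memset" } */

/* With a variable length this would normally be emitted as a call to
   memset; with -finline-stringops=memset it is expanded inline, loop
   included.  Plain -finline-stringops, with no value, covers all four
   supported functions.  */
void *
clear_buffer (void *p, unsigned long n)
{
  return __builtin_memset (p, 0, n);
}

/* { dg-final { scan-assembler-not {\mmemset\M} } } */
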
diff --git a/gcc/expr.cc b/gcc/expr.cc
index d5b6494b4fc..1afed88677a 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
 
-  emit_move_insn (x, y);
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  if (downwards)
+    emit_label (cmp_label);
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,181 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
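
For orientation, a rough C-level sketch of the comparison loop that
emit_block_cmp_via_loop expands to when an ordered result is required
and the length is a multiple of the chunk size.  Illustration only: the
name inline_memcmp_shape and the choice of unsigned long as the chunk
type are made up, and the actual expansion works on RTL with the chunk
size derived from the alignment and the trailing-zero count of the
length.

static int
inline_memcmp_shape (const unsigned char *x, const unsigned char *y,
                     unsigned long len)
{
  for (unsigned long i = 0; i < len; i += sizeof (unsigned long))
    {
      unsigned long xw, yw;
      __builtin_memcpy (&xw, x + i, sizeof xw);
      __builtin_memcpy (&yw, y + i, sizeof yw);
      if (xw != yw)
        /* First mismatching chunk: re-compare it bytewise to recover
           the sign that memcmp promises.  */
        for (unsigned long j = 0; j < sizeof xw; j++)
          if (x[i + j] != y[i + j])
            return x[i + j] < y[i + j] ? -1 : 1;
    }
  return 0;
}
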
diff --git a/gcc/expr.h b/gcc/expr.h
index 11bff531862..988c783e450 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 7466c1106f2..7a20cfd5de3 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -437,6 +437,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..a368f074112
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..f1d77b50a4c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
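
For the memmove case, the problem emit_block_move_via_oriented_loop
deals with is the textbook one: pick a forward or a backward copy so
that no source byte is overwritten before it has been read.  A rough
C-level sketch of that idea follows; it is an illustration only, with a
made-up function name, while the patch performs the address test and
emits the two loops at the RTL level (see the expr.cc hunk above for the
exact form).

#include <stdint.h>

static void
oriented_copy_shape (unsigned char *d, const unsigned char *s,
                     unsigned long n)
{
  if ((uintptr_t) d <= (uintptr_t) s || (uintptr_t) d >= (uintptr_t) s + n)
    {
      /* Destination starts at or below the source, or past its end:
         a forward copy never clobbers a byte it still has to read.  */
      for (unsigned long i = 0; i < n; i++)
        d[i] = s[i];
    }
  else
    {
      /* Destination overlaps the tail of the source: copy backwards.  */
      for (unsigned long i = n; i-- > 0; )
        d[i] = s[i];
    }
}
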
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-06-02  4:42 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-06-02  4:42 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:70c357a8c239456ac128823305bb0f645f1a242b

commit 70c357a8c239456ac128823305bb0f645f1a242b
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Thu Jun 1 22:38:29 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 ++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 374 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 646 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 8400adaf5b4..1beaa4eae97 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3769,7 +3769,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4335,6 +4335,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4374,9 +4378,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4418,7 +4483,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4432,18 +4499,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4451,6 +4531,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4737,7 +4822,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7380,7 +7466,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..fcf945019ec 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1882,6 +1882,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9130104af22..923c8005f57 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -550,6 +550,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12081,6 +12082,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (@code{memcmp}, @code{memcpy},
+@code{memmove}, and @code{memset}) inline, even when the length is
+variable or large enough to require looping.  This is most useful along
+with @option{-ffreestanding} and @option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 56b51876f80..efa409643a3 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
 
-  emit_move_insn (x, y);
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  if (downwards)
+    emit_label (cmp_label);
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,181 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
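
For illustration, the compare loop added above corresponds roughly to the
following C sketch.  It is not part of the patch: INCR stands in for the
block size the expander derives from the alignment and the trailing-zero
count of the length, and LEN is assumed to be a multiple of INCR, as the
expander arranges.

#include <stddef.h>
#include <string.h>

static int
cmp_via_loop_sketch (const void *x, const void *y, size_t len)
{
  enum { INCR = sizeof (long) };   /* stand-in for the chosen block size */
  const unsigned char *p = x, *q = y;

  /* Compare one INCR-sized block per iteration.  */
  for (size_t i = 0; i < len; i += INCR)
    if (memcmp (p + i, q + i, INCR) != 0)
      /* A difference was found: pinpoint the first differing byte so a
         non-equality caller gets the ordered result.  */
      for (size_t j = i; j < i + INCR; j++)
        if (p[j] != q[j])
          return p[j] < q[j] ? -1 : 1;

  return 0;
}

The RTL expansion differs in detail: it subtracts whole blocks in an
integer mode when one is usable, falls back to compare-by-pieces
otherwise, and narrows a differing block by re-running the loop at word
and then byte granularity rather than using the inner byte loop above.
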
diff --git a/gcc/expr.h b/gcc/expr.h
index 0c059ed9023..63c75c9688c 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 78dbdbe90dd..a9d9eb166b6 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -432,6 +432,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..a368f074112
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2cd2057a9b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..f1d77b50a4c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */
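
As a usage sketch, not part of the testsuite, the scenario the option is
mainly aimed at looks like the code below, compiled along the lines of
-O2 -ffreestanding -finline-stringops=memset; the function name and the
other flags are only illustrative.

/* With -finline-stringops=memset this variable-length clear is expected
   to be expanded inline, ultimately as a loop, so no memset call needs
   to be satisfied by a C runtime library.  */
void
clear_buffer (unsigned char *buf, unsigned long n)
{
  __builtin_memset (buf, 0, n);
}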

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-06-02  4:34 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-06-02  4:34 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:6c0a6b064e1a98e411a37f62a52a3fe5ae814d67

commit 6c0a6b064e1a98e411a37f62a52a3fe5ae814d67
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Thu Jun 1 22:38:29 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 ++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 374 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   6 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 645 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 8400adaf5b4..1beaa4eae97 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3769,7 +3769,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4335,6 +4335,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4374,9 +4378,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4418,7 +4483,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4432,18 +4499,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4451,6 +4531,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4737,7 +4822,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7380,7 +7466,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..fcf945019ec 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1882,6 +1882,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9130104af22..923c8005f57 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -550,6 +550,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12081,6 +12082,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or big enough to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 56b51876f80..efa409643a3 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
 
-  emit_move_insn (x, y);
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  if (downwards)
+    emit_label (cmp_label);
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,181 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index 0c059ed9023..63c75c9688c 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 78dbdbe90dd..a9d9eb166b6 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -432,6 +432,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..8fdc5bb7c7d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not {(?!\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..fc665d92894
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(?!\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..f1d77b50a4c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */
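
To give a feel for the memmove handling, the dynamic-direction expansion
behaves roughly like the C sketch below.  This is illustrative only: the
emitted run-time overlap test differs in detail, and the loops move
INCR-byte blocks derived from the alignment and the trailing-zero count
of the length rather than single bytes.

#include <stddef.h>

static void
move_via_oriented_loop_sketch (unsigned char *dst, const unsigned char *src,
                               size_t n)
{
  if (dst <= src || src + n <= dst)
    /* A forward copy is safe: no source byte still to be read can be
       overwritten by an earlier store.  */
    for (size_t i = 0; i < n; i++)
      dst[i] = src[i];
  else
    /* The destination overlaps the source from above: copy backwards.  */
    for (size_t i = n; i-- > 0; )
      dst[i] = src[i];
}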

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-06-02  4:10 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-06-02  4:10 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:ca30d4ef5482b5a1d6d4cd0e4a4bad710beb48aa

commit ca30d4ef5482b5a1d6d4cd0e4a4bad710beb48aa
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Thu Jun 1 22:38:29 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 ++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 374 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   6 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 645 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 8400adaf5b4..1beaa4eae97 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3769,7 +3769,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4335,6 +4335,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4374,9 +4378,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4418,7 +4483,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4432,18 +4499,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4451,6 +4531,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4737,7 +4822,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7380,7 +7466,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..fcf945019ec 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1882,6 +1882,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9130104af22..923c8005f57 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -550,6 +550,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12081,6 +12082,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (@code{memcmp}, @code{memcpy},
+@code{memmove} and @code{memset}) inline, even when the length is
+variable or large enough to require looping.  This is most useful
+along with @option{-ffreestanding} and @option{-fno-builtin}.
+
+In some circumstances, this enables the compiler to generate code that
+takes advantage of known alignment and length multiples, but even then
+it may be less efficient than an optimized runtime implementation, and
+it may grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
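
To make the documented behavior concrete, consider a hypothetical
freestanding unit like the one below (file and function names are made
up for illustration).  Built with something along the lines of
gcc -O2 -ffreestanding -finline-stringops -c clear.c, the
variable-length call is expanded to an inline loop instead of a call
into the C runtime library.

/* clear.c: hypothetical example; the same applies to memcmp, memcpy
   and memmove when their bits are enabled.  */
void
clear_buffer (unsigned char *buf, unsigned long n)
{
  /* N is not known at compile time; without -finline-stringops this
     would normally be emitted as a memset call.  */
  __builtin_memset (buf, 0, n);
}
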
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 56b51876f80..efa409643a3 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
 
-  emit_move_insn (x, y);
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  if (downwards)
+    emit_label (cmp_label);
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,181 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both LEN
+   and ALIGN, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs so as to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
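
In source-level terms, the emit_block_cmp_via_loop expansion above
amounts to the sketch below: compare block-sized chunks in a loop and,
only when a chunk differs, narrow the search to the byte that decides
the ordering.  This is a simplified C rendition that assumes the chunk
size divides the length, as the ctz/alignment logic in the patch
arranges for the block size it picks; it is not the code the compiler
actually emits.

#include <stddef.h>

static int
block_cmp_sketch (const void *a, const void *b, size_t len)
{
  const unsigned char *x = a;
  const unsigned char *y = b;

  for (size_t i = 0; i < len; i += sizeof (unsigned long))
    {
      unsigned long wx, wy;
      __builtin_memcpy (&wx, x + i, sizeof wx);
      __builtin_memcpy (&wy, y + i, sizeof wy);
      if (wx != wy)
	{
	  /* An equality-only caller could simply return 1 here; for
	     the full memcmp result, pinpoint the first differing
	     byte, much as the word-then-byte re-comparison does.  */
	  for (size_t j = i; ; j++)
	    if (x[j] != y[j])
	      return x[j] < y[j] ? -1 : 1;
	}
    }
  return 0;
}
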
diff --git a/gcc/expr.h b/gcc/expr.h
index 0c059ed9023..63c75c9688c 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 78dbdbe90dd..a9d9eb166b6 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -432,6 +432,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
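
Each ILSOP_ value above is a distinct bit, so the option variable acts
as a mask and the checks scattered through builtins.cc and expr.cc
reduce to tests of the shape sketched below.  The enum is repeated from
the hunk above for self-containment, the helper name is made up, and
flag_inline_stringops merely stands in for the option variable declared
in common.opt.

enum ilsop_fn
{
  ILSOP_NONE = 0,
  ILSOP_MEMSET = 1 << 0,
  ILSOP_MEMCPY = 1 << 1,
  ILSOP_MEMMOVE = 1 << 2,
  ILSOP_MEMCMP = 1 << 3,
  ILSOP_ALL = -1  /* What plain -finline-stringops selects.  */
};

static int flag_inline_stringops = ILSOP_NONE;

/* Mirrors the selection in emit_block_move_hints: a move that might
   overlap answers to the memmove bit, a plain copy to the memcpy
   bit.  */
static int
forced_inline_block_move_p (int might_overlap)
{
  int bit = might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY;
  return (flag_inline_stringops & bit) != 0;
}
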
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..31a6d1dda6d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not {\mmemcmp\M} } } */
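
The test above requests a single function; since each value sits in its
own EnumSet set in common.opt, several can be combined in one option
instance.  A hypothetical variant along the following lines (not part
of the patch, and assuming the usual comma-separated EnumSet syntax)
would exercise two of them at once:

/* { dg-do compile } */
/* { dg-options "-finline-stringops=memcpy,memset -fno-lto" } */

void *copy (void *d, const void *s, unsigned long n)
{
  return __builtin_memcpy (d, s, n);
}

void *wipe (void *d, unsigned long n)
{
  return __builtin_memset (d, 0, n);
}

/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
/* { dg-final { scan-assembler-not {\mmemset\M} } } */
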
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..c98e903c1f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..15d013118c1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..f1d77b50a4c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..bdcf9bf5b61
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-06-02  3:55 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-06-02  3:55 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:e13319a5d17a24bc3e130800634724edc1f3c390

commit e13319a5d17a24bc3e130800634724edc1f3c390
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Thu Jun 1 22:38:29 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 ++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 374 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   6 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 645 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 8400adaf5b4..1beaa4eae97 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3769,7 +3769,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4335,6 +4335,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4374,9 +4378,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all power-of-two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4418,7 +4483,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4432,18 +4499,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4451,6 +4531,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4737,7 +4822,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7380,7 +7466,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..fcf945019ec 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1882,6 +1882,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even when it is not an optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9130104af22..923c8005f57 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -550,6 +550,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12081,6 +12082,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (@code{memcmp}, @code{memcpy},
+@code{memmove} and @code{memset}) inline, even when the length is
+variable or large enough to require looping.  This is most useful
+along with @option{-ffreestanding} and @option{-fno-builtin}.
+
+In some circumstances, this enables the compiler to generate code that
+takes advantage of known alignment and length multiples, but even then
+it may be less efficient than an optimized runtime implementation, and
+it may grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 56b51876f80..efa409643a3 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
 
-  emit_move_insn (x, y);
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  if (downwards)
+    emit_label (cmp_label);
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,181 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both LEN
+   and ALIGN, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs so as to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index 0c059ed9023..63c75c9688c 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 78dbdbe90dd..a9d9eb166b6 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -432,6 +432,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..80e45e5ba5a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not "\mmemcmp\M" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..4a6522d94f7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not "\mmemcpy\M" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..ae622137859
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not "\mmemcpy\M" } } */
+/* { dg-final { scan-assembler-not "\mmemcmp\M" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..77d997a353b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc/c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not "\mmemcpy\M" } } */
+/* { dg-final { scan-assembler-not "\mmemmove\M" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..1e7048155ba
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "\mmemset\M" } } */
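
The ILSOP_* values introduced in flag-types.h form a bitmask, so the
per-function requests from -finline-stringops= can be combined and each
expander only needs a single bitwise test, as seen in the builtins.cc and
expr.cc changes.  A minimal stand-alone sketch of that idiom (the enum is
repeated here purely for illustration; in the patch the values live in
flag-types.h):

    enum ilsop_fn { ILSOP_NONE = 0, ILSOP_MEMSET = 1 << 0,
                    ILSOP_MEMCPY = 1 << 1, ILSOP_MEMMOVE = 1 << 2,
                    ILSOP_MEMCMP = 1 << 3, ILSOP_ALL = -1 };

    /* Requesting both memcpy and memcmp sets these two bits; plain
       -finline-stringops requests ILSOP_ALL.  */
    static int requested (enum ilsop_fn flags, enum ilsop_fn fn)
    {
      return (flags & fn) != 0;
    }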

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-06-02  3:47 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-06-02  3:47 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:74e8cbdbae7df9068911587c2718ff110651749a

commit 74e8cbdbae7df9068911587c2718ff110651749a
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Thu Jun 1 22:38:29 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would be, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
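
    For illustration (the function below is made up for this example,
    not part of the patch or testsuite): built with -ffreestanding and
    -finline-stringops=memset, a variable-length clear like

        void clear_words (unsigned int *p, unsigned int n)
        {
          __builtin_memset (p, 0, n * sizeof (*p));
        }

    should be expanded inline on a typical target, starting with a loop
    that exploits the known pointer alignment and the length being a
    multiple of sizeof (*p), instead of falling back to a memset
    libcall.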
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 ++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 374 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   6 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   9 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 645 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 8400adaf5b4..1beaa4eae97 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3769,7 +3769,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4335,6 +4335,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4374,9 +4378,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4418,7 +4483,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4432,18 +4499,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4451,6 +4531,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4737,7 +4822,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7380,7 +7466,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..fcf945019ec 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1882,6 +1882,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9130104af22..923c8005f57 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -550,6 +550,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12081,6 +12082,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or large enough to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 56b51876f80..efa409643a3 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
 
-  emit_move_insn (x, y);
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  if (downwards)
+    emit_label (cmp_label);
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,181 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constants.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index 0c059ed9023..63c75c9688c 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 78dbdbe90dd..a9d9eb166b6 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -432,6 +432,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..fcb43e76d99
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not "\mmemcmp\M" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..2ca1ce813e6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not "\mmemcpy\M" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..2a3b056ce02
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not "\mmemcpy\M" } } */
+/* { dg-final { scan-assembler-not "\mmemcmp\M" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..70ba142fa64
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc/c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not "\mmemcpy\M" } } */
+/* { dg-final { scan-assembler-not "\mmemmove\M" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..1e7048155ba
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "\mmemset\M" } } */
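
For the new memcmp expansion, emit_block_cmp_via_loop compares the buffers
in blocks and, for non-equality callers, pinpoints the first differing unit
so the result has the right sign.  A rough C-level sketch of that behaviour,
assuming a 4-byte block size and a length that is a multiple of 4
(illustration only, not the emitted code):

    static int cmp_loop (const unsigned char *x, const unsigned char *y,
                         unsigned long len)
    {
      for (unsigned long i = 0; i < len; i += 4)
        {
          unsigned int a, b;
          __builtin_memcpy (&a, x + i, 4);
          __builtin_memcpy (&b, y + i, 4);
          if (a != b)
            /* A block differs: find the first differing byte so the
               result has the same sign as a byte-wise memcmp.  */
            for (unsigned long j = i; j < i + 4; j++)
              if (x[j] != y[j])
                return x[j] < y[j] ? -1 : 1;
        }
      return 0;
    }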

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops
@ 2023-06-02  1:42 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-06-02  1:42 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:c5c0395b0615f54fc8919e3af4f20c1f0ce3a8c6

commit c5c0395b0615f54fc8919e3af4f20c1f0ce3a8c6
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Thu Jun 1 22:38:29 2023 -0300

    Introduce -finline-stringops
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would be, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch introduces -finline-stringops[=fn] to request expansions to
    start with a loop, so as to still take advantage of known alignment
    even with long lengths, but without necessarily adding store blocks
    for every power of two.
    
    This makes it possible for the supported stringops (memset, memcpy,
    memmove, memcmp) to be expanded, even if storing a single byte per
    iteration.  Surely efficient implementations can run faster, with a
    pre-loop to increase alignment, but that would likely be excessive for
    inline expansions.
    
    Still, in some cases, such as in freestanding environments, users
    prefer to inline such stringops, especially those that the compiler
    may introduce itself, even if the expansion is not as performant as a
    highly optimized C library implementation could be, to avoid
    depending on a C runtime library.
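
    Roughly, for memset the requested expansion has the following shape
    at the C level, assuming the length is known to be a multiple of 8
    (an illustrative sketch only; the real code is emitted as RTL by
    try_store_by_multiple_pieces, and the block sizes depend on the
    known alignment and length multiple):

        void set_loop (unsigned char *p, unsigned long len, int c)
        {
          /* Main loop: the largest block the alignment allows.  */
          while (len >= 64)
            { __builtin_memset (p, c, 64); p += 64; len -= 64; }
          /* Tail: one conditional block per remaining power of two.  */
          if (len >= 32) { __builtin_memset (p, c, 32); p += 32; len -= 32; }
          if (len >= 16) { __builtin_memset (p, c, 16); p += 16; len -= 16; }
          if (len >= 8) __builtin_memset (p, c, 8);
        }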
    
    
    for  gcc/ChangeLog
    
            * expr.cc (emit_block_move_hints): Take ctz of len.  Obey
            -finline-stringops.  Use oriented or sized loop.
            (emit_block_move): Take ctz of len, and pass it on.
            (emit_block_move_via_sized_loop): New.
            (emit_block_move_via_oriented_loop): New.
            (emit_block_move_via_loop): Take incr.  Move an incr-sized
            block per iteration.
            (emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
            -finline-stringops.
            (emit_block_cmp_via_loop): New.
            * expr.h (emit_block_move): Add ctz of len defaulting to zero.
            (emit_block_move_hints): Likewise.
            (emit_block_cmp_hints): Likewise.
            * builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
            len to emit_block_move_hints.
            (try_store_by_multiple_pieces): Support starting with a loop.
            (expand_builtin_memcmp): Pass ctz of len to
            emit_block_cmp_hints.
            (expand_builtin): Allow inline expansion of memset, memcpy,
            memmove and memcmp if requested.
            * common.opt (finline-stringops): New.
            (ilsop_fn): New enum.
            * flag-types.h (enum ilsop_fn): New.
            * doc/invoke.texi (-finline-stringops): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-1.c: New.
            * gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
            * gcc.dg/torture/inline-mem-move-1.c: New.
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                    | 114 ++++++-
 gcc/common.opt                                     |  34 ++
 gcc/doc/invoke.texi                                |  15 +
 gcc/expr.cc                                        | 374 ++++++++++++++++++++-
 gcc/expr.h                                         |   9 +-
 gcc/flag-types.h                                   |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   6 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |   8 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |   7 +
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |  84 +++++
 11 files changed, 643 insertions(+), 30 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 8400adaf5b4..1beaa4eae97 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3769,7 +3769,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4335,6 +4335,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4374,9 +4378,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 need not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4418,7 +4483,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4432,18 +4499,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4451,6 +4531,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4737,7 +4822,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7380,7 +7466,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..fcf945019ec 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1882,6 +1882,40 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9130104af22..923c8005f57 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -550,6 +550,7 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12081,6 +12082,20 @@ their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or large enough to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 56b51876f80..efa409643a3 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
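As a rough illustration of the INCR selection just above (the real code additionally shrinks INCR until can_move_by_pieces accepts it, which the sketch below omits; pick_incr is a made-up standalone name):

  /* Illustrative sketch only: bytes moved per loop iteration, derived from
     the alignment (in bits) and the known trailing-zeros count of the
     length.  E.g. align_bits = 64, ctz_size = 2 yields 4 bytes.  */
  unsigned
  pick_incr (unsigned align_bits, unsigned ctz_size)
  {
    unsigned incr = align_bits / 8;   /* what the alignment alone allows */
    if ((1u << ctz_size) < incr)      /* but SIZE is only known to be a
                                         multiple of 1 << ctz_size */
      incr = 1u << ctz_size;
    return incr;
  }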
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
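For background, the reason two loop orientations are needed is the usual memmove consideration; the following is a generic C sketch of that idea (a behavioural illustration only, not a literal rendering of the emitted RTL or of the exact test above):

  #include <stdint.h>

  /* Copy N bytes from SRC to DST, choosing the direction so that no byte
     is overwritten before it has been read.  */
  void
  move_sketch (unsigned char *dst, const unsigned char *src, unsigned long n)
  {
    uintptr_t d = (uintptr_t) dst, s = (uintptr_t) src;
    if (d <= s || s + n <= d)
      /* A forward copy cannot clobber unread source bytes.  */
      for (unsigned long i = 0; i < n; i++)
        dst[i] = src[i];
    else
      /* DST lands inside [SRC, SRC+N): copy from the end downwards.  */
      for (unsigned long i = n; i > 0; i--)
        dst[i - 1] = src[i - 1];
  }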
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
 
-  emit_move_insn (x, y);
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  if (downwards)
+    emit_label (cmp_label);
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,181 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constats.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
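In plain C, the comparison strategy above behaves roughly like the sketch below: a chunked equality test, then a byte-level pass to recover the memcmp sign.  The fixed word-sized chunk stands in for the INCR the real code derives from alignment and CTZ_LEN, so the sketch assumes LEN is a multiple of that chunk size; cmp_sketch is a made-up name.

  #include <string.h>

  int
  cmp_sketch (const unsigned char *x, const unsigned char *y,
              unsigned long len, int equality_only)
  {
    for (unsigned long i = 0; i < len; i += sizeof (unsigned long))
      {
        unsigned long a, b;
        memcpy (&a, x + i, sizeof a);   /* word-sized loads, like the
                                           int_cmp_mode path */
        memcpy (&b, y + i, sizeof b);
        if (a != b)
          {
            if (equality_only)
              return 1;                 /* any nonzero value will do */
            /* Pinpoint the first differing byte for the sign.  */
            for (unsigned j = 0; j < sizeof a; j++)
              if (x[i + j] != y[i + j])
                return x[i + j] < y[i + j] ? -1 : 1;
          }
      }
    return 0;
  }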
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index 0c059ed9023..63c75c9688c 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 78dbdbe90dd..a9d9eb166b6 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -432,6 +432,17 @@ enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
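These values form a bitmask, which is how the expr.cc changes above consult them: a possibly-overlapping block move is gated by the memmove bit, a plain copy by the memcpy bit, and block compares by the memcmp bit.  A minimal standalone illustration (the function name and flags parameter are hypothetical stand-ins for flag_inline_stringops):

  enum ilsop_fn
  {
    ILSOP_NONE = 0,
    ILSOP_MEMSET = 1 << 0,
    ILSOP_MEMCPY = 1 << 1,
    ILSOP_MEMMOVE = 1 << 2,
    ILSOP_MEMCMP = 1 << 3,
    ILSOP_ALL = -1
  };

  /* Mirrors the gating test in emit_block_move_hints.  */
  int
  inlining_requested (int flags, int might_overlap)
  {
    return (flags & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)) != 0;
  }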
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..bcd774c0f77
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcmp -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not "memcmp" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..2a7d74fbee4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcpy -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..6b4fef0b20f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memcmp" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..5e38bc99ce1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memmove" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..8a2f5782b5a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */
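These dg-options match the forms a user would pass on the command line: plain -finline-stringops requests inline expansion of all supported string operations, while -finline-stringops=memcmp, =memcpy or =memmove restricts it to a single builtin.  Judging by ILSOP_MEMSET above, an =memset form is presumably accepted as well, though these tests only exercise memset through the umbrella option.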


Thread overview: 16+ messages
2023-06-02  4:39 [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-stringops Alexandre Oliva
  -- strict thread matches above, loose matches on Subject: below --
2023-11-29  4:35 Alexandre Oliva
2023-11-23 11:45 Alexandre Oliva
2023-11-20  9:03 Alexandre Oliva
2023-09-23  5:23 Alexandre Oliva
2023-09-22  5:46 Alexandre Oliva
2023-09-21  5:07 Alexandre Oliva
2023-09-21  4:53 Alexandre Oliva
2023-09-21  3:19 Alexandre Oliva
2023-09-15  5:42 Alexandre Oliva
2023-06-02  4:42 Alexandre Oliva
2023-06-02  4:34 Alexandre Oliva
2023-06-02  4:10 Alexandre Oliva
2023-06-02  3:55 Alexandre Oliva
2023-06-02  3:47 Alexandre Oliva
2023-06-02  1:42 Alexandre Oliva
