public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-all-stringops=memset
@ 2023-01-20 18:50 Alexandre Oliva
  0 siblings, 0 replies; 3+ messages in thread
From: Alexandre Oliva @ 2023-01-20 18:50 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:9201ad5763174f8be08ab4e75e00b44f2101ceb2

commit 9201ad5763174f8be08ab4e75e00b44f2101ceb2
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Thu Dec 22 21:28:44 2022 -0300

    Introduce -finline-all-stringops=memset
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch extends the memset expansion to start with a loop, so as to
    still take advantage of known alignment even with long lengths, but
    without necessarily adding store blocks for every power of two.
    
    This makes it possible for any memset call to be expanded, even if
    storing a single byte per iteration.  Surely efficient implementations
    of memset can do better, with a pre-loop to increase alignment, but
    that would likely be excessive for inline expansions of memset.
    
    Still, in some cases, users prefer to inline memset, even if it's not
    as performant, or when it's known to be performant in ways the
    compiler can't tell, or to avoid depending on a C runtime library.
    
    With this flag, global or per-function optimizations may enable inline
    expansion of memset to be selectively requested, while the
    infrastructure for that may enable us to introduce per-target tuning
    to enable such looping when advantageous, even if not explicitly
    requested.
    
    
    for  gcc/ChangeLog
    
            * builtins.cc (try_store_by_multiple_pieces): Support starting
            with a loop.
            * common.opt (finline-all-stringops=memset): New.
            * doc/invoke.texi (-finline-all-stringops=memset): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                 | 99 +++++++++++++++++++++++--
 gcc/common.opt                                  |  4 +
 gcc/doc/invoke.texi                             | 13 +++-
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 84 +++++++++++++++++++++
 4 files changed, 192 insertions(+), 8 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index af45829875e..733fe17ede6 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4322,6 +4322,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4361,9 +4365,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!flag_inline_memset_loops)
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 needs not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4405,7 +4470,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4419,18 +4486,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4438,6 +4518,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index d0371aec8db..5d28f054241 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,10 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(0) Optimization
+Inline memset even if it requires loops.
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 631c00582bf..8f8d52bbeef 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion @gol
 -fif-conversion2  -findirect-inlining @gol
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp  -fipa-cp-clone @gol
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const @gol
 -fipa-reference  -fipa-reference-addressable @gol
 -fipa-stack-alignment  -fipa-icf  -fira-algorithm=@var{algorithm} @gol
@@ -11961,6 +11962,16 @@ in its own right.
 Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
 but not @option{-Og}.
 
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough as to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache.  This
+option is disabled by default.
+
 @item -fearly-inlining
 @opindex fearly-inlining
 Inline functions marked by @code{always_inline} and functions whose body seems
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..4de51df006e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-memset-loops -gno-record-gcc-switches -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-all-stringops=memset
@ 2023-01-27  1:57 Alexandre Oliva
  0 siblings, 0 replies; 3+ messages in thread
From: Alexandre Oliva @ 2023-01-27  1:57 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:67c928202a9c09795afdf84bb289adf0e76517f1

commit 67c928202a9c09795afdf84bb289adf0e76517f1
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Thu Jan 26 22:52:10 2023 -0300

    Introduce -finline-all-stringops=memset
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch extends the memset expansion to start with a loop, so as to
    still take advantage of known alignment even with long lengths, but
    without necessarily adding store blocks for every power of two.
    
    This makes it possible for any memset call to be expanded, even if
    storing a single byte per iteration.  Surely efficient implementations
    of memset can do better, with a pre-loop to increase alignment, but
    that would likely be excessive for inline expansions of memset.
    
    Still, in some cases, users prefer to inline memset, even if it's not
    as performant, or when it's known to be performant in ways the
    compiler can't tell, or to avoid depending on a C runtime library.
    
    With this flag, global or per-function optimizations may enable inline
    expansion of memset to be selectively requested, while the
    infrastructure for that may enable us to introduce per-target tuning
    to enable such looping when advantageous, even if not explicitly
    requested.
    
    
    for  gcc/ChangeLog
    
            * builtins.cc (try_store_by_multiple_pieces): Support starting
            with a loop.
            * common.opt (finline-all-stringops=memset): New.
            * doc/invoke.texi (-finline-all-stringops=memset): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                 | 99 +++++++++++++++++++++++--
 gcc/common.opt                                  |  4 +
 gcc/doc/invoke.texi                             | 13 +++-
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 84 +++++++++++++++++++++
 4 files changed, 192 insertions(+), 8 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index af45829875e..733fe17ede6 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4322,6 +4322,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4361,9 +4365,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!flag_inline_memset_loops)
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 needs not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4405,7 +4470,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4419,18 +4486,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4438,6 +4518,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index d0371aec8db..5d28f054241 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,10 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(0) Optimization
+Inline memset even if it requires loops.
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 06d77983e30..651307f315d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion @gol
 -fif-conversion2  -findirect-inlining @gol
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp  -fipa-cp-clone @gol
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const @gol
 -fipa-reference  -fipa-reference-addressable @gol
 -fipa-stack-alignment  -fipa-icf  -fira-algorithm=@var{algorithm} @gol
@@ -11963,6 +11964,16 @@ in its own right.
 Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
 but not @option{-Og}.
 
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough as to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache.  This
+option is disabled by default.
+
 @item -fearly-inlining
 @opindex fearly-inlining
 Inline functions marked by @code{always_inline} and functions whose body seems
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..4de51df006e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-memset-loops -gno-record-gcc-switches -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-all-stringops=memset
@ 2023-01-20 16:58 Alexandre Oliva
  0 siblings, 0 replies; 3+ messages in thread
From: Alexandre Oliva @ 2023-01-20 16:58 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:938f55f70978cb15f5122dab86d16f6364bdf756

commit 938f55f70978cb15f5122dab86d16f6364bdf756
Author: Alexandre Oliva <oliva@adacore.com>
Date:   Thu Dec 22 21:28:44 2022 -0300

    Introduce -finline-all-stringops=memset
    
    try_store_by_multiple_pieces was added not long ago, enabling
    variable-sized memset to be expanded inline when the worst-case
    in-range constant length would, using conditional blocks with powers
    of two to cover all possibilities of length and alignment.
    
    This patch extends the memset expansion to start with a loop, so as to
    still take advantage of known alignment even with long lengths, but
    without necessarily adding store blocks for every power of two.
    
    This makes it possible for any memset call to be expanded, even if
    storing a single byte per iteration.  Surely efficient implementations
    of memset can do better, with a pre-loop to increase alignment, but
    that would likely be excessive for inline expansions of memset.
    
    Still, in some cases, users prefer to inline memset, even if it's not
    as performant, or when it's known to be performant in ways the
    compiler can't tell, or to avoid depending on a C runtime library.
    
    With this flag, global or per-function optimizations may enable inline
    expansion of memset to be selectively requested, while the
    infrastructure for that may enable us to introduce per-target tuning
    to enable such looping when advantageous, even if not explicitly
    requested.
    
    
    for  gcc/ChangeLog
    
            * builtins.cc (try_store_by_multiple_pieces): Support starting
            with a loop.
            * common.opt (finline-all-stringops=memset): New.
            * doc/invoke.texi (-finline-all-stringops=memset): Add.
    
    for  gcc/testsuite/ChangeLog
    
            * gcc.dg/torture/inline-mem-set-1.c: New.

Diff:
---
 gcc/builtins.cc                                 | 99 +++++++++++++++++++++++--
 gcc/common.opt                                  |  4 +
 gcc/doc/invoke.texi                             | 13 +++-
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 84 +++++++++++++++++++++
 4 files changed, 192 insertions(+), 8 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index af45829875e..733fe17ede6 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4322,6 +4322,10 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4361,9 +4365,70 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!flag_inline_memset_loops)
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 needs not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4405,7 +4470,9 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4419,18 +4486,31 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4438,6 +4518,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index d0371aec8db..5d28f054241 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,10 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(0) Optimization
+Inline memset even if it requires loops.
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 631c00582bf..8f8d52bbeef 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion @gol
 -fif-conversion2  -findirect-inlining @gol
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp  -fipa-cp-clone @gol
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const @gol
 -fipa-reference  -fipa-reference-addressable @gol
 -fipa-stack-alignment  -fipa-icf  -fira-algorithm=@var{algorithm} @gol
@@ -11961,6 +11962,16 @@ in its own right.
 Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
 but not @option{-Og}.
 
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough as to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache.  This
+option is disabled by default.
+
 @item -fearly-inlining
 @opindex fearly-inlining
 Inline functions marked by @code{always_inline} and functions whose body seems
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..4de51df006e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-memset-loops -gno-record-gcc-switches -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-01-27  1:57 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-20 18:50 [gcc(refs/users/aoliva/heads/testme)] Introduce -finline-all-stringops=memset Alexandre Oliva
  -- strict thread matches above, loose matches on Subject: below --
2023-01-27  1:57 Alexandre Oliva
2023-01-20 16:58 Alexandre Oliva

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).