[gcc(refs/users/aoliva/heads/testme)] inline memset loops

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

* [gcc(refs/users/aoliva/heads/testme)] inline memset loops
@ 2022-12-23  7:38 Alexandre Oliva
  0 siblings, 0 replies; 4+ messages in thread
From: Alexandre Oliva @ 2022-12-23  7:38 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:3ec66f83d972290005242d1ee097514037cfe73d

commit 3ec66f83d972290005242d1ee097514037cfe73d
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Thu Dec 22 21:28:44 2022 -0300

    inline memset loops

Diff:
---
 gcc/builtins.cc                                 | 50 +++++++++++++++++++++++--
 gcc/common.opt                                  |  4 ++
 gcc/doc/invoke.texi                             | 13 ++++++-
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 14 +++++++
 4 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 02c4fefa86f..388bae58ce4 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4361,9 +4361,37 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
+  bool max_loop = false;
   if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
 			    &valc, align, true))
-    return false;
+    {
+      if (!flag_inline_memset_loops)
+	return false;
+      while (--max_bits >= sctz_len)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  if (can_store_by_pieces (xlenest + blksize,
+				   builtin_memset_read_str,
+				   &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (!blksize)
+	    continue;
+	  if (can_store_by_pieces (xlenest,
+				   builtin_memset_read_str,
+				   &valc, align, true))
+	    {
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	}
+      if (!max_loop)
+	return false;
+    }
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4405,6 +4433,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
       blksize = HOST_WIDE_INT_1U << i;
 
@@ -4423,14 +4452,24 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
       else if ((max_len & blksize) == 0)
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4438,6 +4477,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index 562d73d7f55..c28af170be8 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,10 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(0) Optimization
+Inline memset even if it requires loops.
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index da9ad1068fb..19f436ad463 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion @gol
 -fif-conversion2  -findirect-inlining @gol
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp  -fipa-cp-clone @gol
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const @gol
 -fipa-reference  -fipa-reference-addressable @gol
 -fipa-stack-alignment  -fipa-icf  -fira-algorithm=@var{algorithm} @gol
@@ -11960,6 +11961,16 @@ in its own right.
 Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
 but not @option{-Og}.
 
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache.  This
+option is disabled by default.
+
 @item -fearly-inlining
 @opindex fearly-inlining
 Inline functions marked by @code{always_inline} and functions whose body seems
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..73bd1025f19
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-memset-loops -gno-record-gcc-switches -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] inline memset loops
@ 2022-12-23  7:32 Alexandre Oliva
  0 siblings, 0 replies; 4+ messages in thread
From: Alexandre Oliva @ 2022-12-23  7:32 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:2dc107c433b4462482acc3fd84693f2d63584f6d

commit 2dc107c433b4462482acc3fd84693f2d63584f6d
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Thu Dec 22 21:28:44 2022 -0300

    inline memset loops

Diff:
---
 gcc/builtins.cc                                 | 50 +++++++++++++++++++++++--
 gcc/common.opt                                  |  4 ++
 gcc/doc/invoke.texi                             | 13 ++++++-
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 14 +++++++
 4 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 02c4fefa86f..388bae58ce4 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4361,9 +4361,37 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
+  bool max_loop = false;
   if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
 			    &valc, align, true))
-    return false;
+    {
+      if (!flag_inline_memset_loops)
+	return false;
+      while (--max_bits >= sctz_len)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  if (can_store_by_pieces (xlenest + blksize,
+				   builtin_memset_read_str,
+				   &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (!blksize)
+	    continue;
+	  if (can_store_by_pieces (xlenest,
+				   builtin_memset_read_str,
+				   &valc, align, true))
+	    {
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	}
+      if (!max_loop)
+	return false;
+    }
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4405,6 +4433,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
       blksize = HOST_WIDE_INT_1U << i;
 
@@ -4423,14 +4452,24 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
       else if ((max_len & blksize) == 0)
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4438,6 +4477,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index 562d73d7f55..c28af170be8 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,10 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(0) Optimization
+Inline memset even if it requires loops.
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index da9ad1068fb..19f436ad463 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion @gol
 -fif-conversion2  -findirect-inlining @gol
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp  -fipa-cp-clone @gol
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const @gol
 -fipa-reference  -fipa-reference-addressable @gol
 -fipa-stack-alignment  -fipa-icf  -fira-algorithm=@var{algorithm} @gol
@@ -11960,6 +11961,16 @@ in its own right.
 Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
 but not @option{-Og}.
 
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache.  This
+option is disabled by default.
+
 @item -fearly-inlining
 @opindex fearly-inlining
 Inline functions marked by @code{always_inline} and functions whose body seems
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..34105a42c07
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-memset-loops" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] inline memset loops
@ 2022-12-23  5:26 Alexandre Oliva
  0 siblings, 0 replies; 4+ messages in thread
From: Alexandre Oliva @ 2022-12-23  5:26 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:8cd58c4687270408ffb342020eb26574ef06989e

commit 8cd58c4687270408ffb342020eb26574ef06989e
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Thu Dec 22 21:28:44 2022 -0300

    inline memset loops

Diff:
---
 gcc/builtins.cc                                 | 50 +++++++++++++++++++++++--
 gcc/common.opt                                  |  4 ++
 gcc/doc/invoke.texi                             | 13 ++++++-
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 14 +++++++
 4 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 02c4fefa86f..388bae58ce4 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4361,9 +4361,37 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
+  bool max_loop = false;
   if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
 			    &valc, align, true))
-    return false;
+    {
+      if (!flag_inline_memset_loops)
+	return false;
+      while (--max_bits >= sctz_len)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  if (can_store_by_pieces (xlenest + blksize,
+				   builtin_memset_read_str,
+				   &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (!blksize)
+	    continue;
+	  if (can_store_by_pieces (xlenest,
+				   builtin_memset_read_str,
+				   &valc, align, true))
+	    {
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	}
+      if (!max_loop)
+	return false;
+    }
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4405,6 +4433,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
       blksize = HOST_WIDE_INT_1U << i;
 
@@ -4423,14 +4452,24 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
       else if ((max_len & blksize) == 0)
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4438,6 +4477,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index 562d73d7f55..c28af170be8 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,10 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(0) Optimization
+Inline memset even if it requires loops.
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index da9ad1068fb..19f436ad463 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion @gol
 -fif-conversion2  -findirect-inlining @gol
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp  -fipa-cp-clone @gol
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const @gol
 -fipa-reference  -fipa-reference-addressable @gol
 -fipa-stack-alignment  -fipa-icf  -fira-algorithm=@var{algorithm} @gol
@@ -11960,6 +11961,16 @@ in its own right.
 Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
 but not @option{-Og}.
 
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache.  This
+option is disabled by default.
+
 @item -fearly-inlining
 @opindex fearly-inlining
 Inline functions marked by @code{always_inline} and functions whose body seems
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..591c54b4fcc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-memset-loops" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] inline memset loops
@ 2022-12-23  0:57 Alexandre Oliva
  0 siblings, 0 replies; 4+ messages in thread
From: Alexandre Oliva @ 2022-12-23  0:57 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:7d172bec5011526917596810a9fe0a140bcbe9a8

commit 7d172bec5011526917596810a9fe0a140bcbe9a8
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Thu Dec 22 21:28:44 2022 -0300

    inline memset loops

Diff:
---
 gcc/builtins.cc     | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 gcc/common.opt      |  5 +++++
 gcc/doc/invoke.texi | 13 ++++++++++++-
 3 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 02c4fefa86f..388bae58ce4 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4361,9 +4361,37 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
+  bool max_loop = false;
   if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
 			    &valc, align, true))
-    return false;
+    {
+      if (!flag_inline_memset_loops)
+	return false;
+      while (--max_bits >= sctz_len)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  if (can_store_by_pieces (xlenest + blksize,
+				   builtin_memset_read_str,
+				   &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (!blksize)
+	    continue;
+	  if (can_store_by_pieces (xlenest,
+				   builtin_memset_read_str,
+				   &valc, align, true))
+	    {
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	}
+      if (!max_loop)
+	return false;
+    }
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4405,6 +4433,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
       blksize = HOST_WIDE_INT_1U << i;
 
@@ -4423,14 +4452,24 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
       else if ((max_len & blksize) == 0)
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4438,6 +4477,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index 562d73d7f55..b21f4225782 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,11 @@ finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+; FIXME: Disable by default!
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(1) Optimization
+Inline memset even if it requires loops.
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index da9ad1068fb..19f436ad463 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion @gol
 -fif-conversion2  -findirect-inlining @gol
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp  -fipa-cp-clone @gol
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const @gol
 -fipa-reference  -fipa-reference-addressable @gol
 -fipa-stack-alignment  -fipa-icf  -fira-algorithm=@var{algorithm} @gol
@@ -11960,6 +11961,16 @@ in its own right.
 Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
 but not @option{-Og}.
 
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache.  This
+option is disabled by default.
+
 @item -fearly-inlining
 @opindex fearly-inlining
 Inline functions marked by @code{always_inline} and functions whose body seems

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-12-23  7:38 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-23  7:38 [gcc(refs/users/aoliva/heads/testme)] inline memset loops Alexandre Oliva
  -- strict thread matches above, loose matches on Subject: below --
2022-12-23  7:32 Alexandre Oliva
2022-12-23  5:26 Alexandre Oliva
2022-12-23  0:57 Alexandre Oliva

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).