public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/aoliva/heads/testme)] inline memset loops
@ 2022-12-23 7:38 Alexandre Oliva
0 siblings, 0 replies; 4+ messages in thread
From: Alexandre Oliva @ 2022-12-23 7:38 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:3ec66f83d972290005242d1ee097514037cfe73d
commit 3ec66f83d972290005242d1ee097514037cfe73d
Author: Alexandre Oliva <oliva@gnu.org>
Date: Thu Dec 22 21:28:44 2022 -0300
inline memset loops
Diff:
---
gcc/builtins.cc | 50 +++++++++++++++++++++++--
gcc/common.opt | 4 ++
gcc/doc/invoke.texi | 13 ++++++-
gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 14 +++++++
4 files changed, 77 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 02c4fefa86f..388bae58ce4 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4361,9 +4361,37 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
if (max_bits >= 0)
xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
- (HOST_WIDE_INT_1U << ctz_len));
+ bool max_loop = false;
if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
&valc, align, true))
- return false;
+ {
+ if (!flag_inline_memset_loops)
+ return false;
+ while (--max_bits >= sctz_len)
+ {
+ xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+ - (HOST_WIDE_INT_1U << ctz_len));
+ if (can_store_by_pieces (xlenest + blksize,
+ builtin_memset_read_str,
+ &valc, align, true))
+ {
+ max_loop = true;
+ break;
+ }
+ if (!blksize)
+ continue;
+ if (can_store_by_pieces (xlenest,
+ builtin_memset_read_str,
+ &valc, align, true))
+ {
+ blksize = 0;
+ max_loop = true;
+ break;
+ }
+ }
+ if (!max_loop)
+ return false;
+ }
by_pieces_constfn constfun;
void *constfundata;
@@ -4405,6 +4433,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
the least significant bit possibly set in the length. */
for (int i = max_bits; i >= sctz_len; i--)
{
+ rtx_code_label *loop_label = NULL;
rtx_code_label *label = NULL;
blksize = HOST_WIDE_INT_1U << i;
@@ -4423,14 +4452,24 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
else if ((max_len & blksize) == 0)
continue;
+ if (max_loop && i == max_bits)
+ {
+ loop_label = gen_label_rtx ();
+ emit_label (loop_label);
+ /* Since we may run this multiple times, don't assume we
+ know anything about the offset. */
+ clear_mem_offset (to);
+ }
+
/* Issue a store of BLKSIZE bytes. */
+ bool update_needed = i != sctz_len || loop_label;
to = store_by_pieces (to, blksize,
constfun, constfundata,
align, true,
- i != sctz_len ? RETURN_END : RETURN_BEGIN);
+ update_needed ? RETURN_END : RETURN_BEGIN);
/* Adjust REM and PTR, unless this is the last iteration. */
- if (i != sctz_len)
+ if (update_needed)
{
emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
to = replace_equiv_address (to, ptr);
@@ -4438,6 +4477,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
}
+ if (loop_label)
+ emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+ ptr_mode, 1, loop_label,
+ profile_probability::likely ());
+
if (label)
{
emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index 562d73d7f55..c28af170be8 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,10 @@ finline-atomics
Common Var(flag_inline_atomics) Init(1) Optimization
Inline __atomic operations when a lock free instruction sequence is available.
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(0) Optimization
+Inline memset even if it requires loops.
+
fcf-protection
Common RejectNegative Alias(fcf-protection=,full)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index da9ad1068fb..19f436ad463 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
-fgcse-sm -fhoist-adjacent-loads -fif-conversion @gol
-fif-conversion2 -findirect-inlining @gol
-finline-functions -finline-functions-called-once -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp -fipa-cp-clone @gol
-fipa-bit-cp -fipa-vrp -fipa-pta -fipa-profile -fipa-pure-const @gol
-fipa-reference -fipa-reference-addressable @gol
-fipa-stack-alignment -fipa-icf -fira-algorithm=@var{algorithm} @gol
@@ -11960,6 +11961,16 @@ in its own right.
Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
but not @option{-Og}.
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache. This
+option is disabled by default.
+
@item -fearly-inlining
@opindex fearly-inlining
Inline functions marked by @code{always_inline} and functions whose body seems
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..73bd1025f19
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-memset-loops -gno-record-gcc-switches -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+ return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+ return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */
^ permalink raw reply [flat|nested] 4+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] inline memset loops
@ 2022-12-23 7:32 Alexandre Oliva
0 siblings, 0 replies; 4+ messages in thread
From: Alexandre Oliva @ 2022-12-23 7:32 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:2dc107c433b4462482acc3fd84693f2d63584f6d
commit 2dc107c433b4462482acc3fd84693f2d63584f6d
Author: Alexandre Oliva <oliva@gnu.org>
Date: Thu Dec 22 21:28:44 2022 -0300
inline memset loops
Diff:
---
gcc/builtins.cc | 50 +++++++++++++++++++++++--
gcc/common.opt | 4 ++
gcc/doc/invoke.texi | 13 ++++++-
gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 14 +++++++
4 files changed, 77 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 02c4fefa86f..388bae58ce4 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4361,9 +4361,37 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
if (max_bits >= 0)
xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
- (HOST_WIDE_INT_1U << ctz_len));
+ bool max_loop = false;
if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
&valc, align, true))
- return false;
+ {
+ if (!flag_inline_memset_loops)
+ return false;
+ while (--max_bits >= sctz_len)
+ {
+ xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+ - (HOST_WIDE_INT_1U << ctz_len));
+ if (can_store_by_pieces (xlenest + blksize,
+ builtin_memset_read_str,
+ &valc, align, true))
+ {
+ max_loop = true;
+ break;
+ }
+ if (!blksize)
+ continue;
+ if (can_store_by_pieces (xlenest,
+ builtin_memset_read_str,
+ &valc, align, true))
+ {
+ blksize = 0;
+ max_loop = true;
+ break;
+ }
+ }
+ if (!max_loop)
+ return false;
+ }
by_pieces_constfn constfun;
void *constfundata;
@@ -4405,6 +4433,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
the least significant bit possibly set in the length. */
for (int i = max_bits; i >= sctz_len; i--)
{
+ rtx_code_label *loop_label = NULL;
rtx_code_label *label = NULL;
blksize = HOST_WIDE_INT_1U << i;
@@ -4423,14 +4452,24 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
else if ((max_len & blksize) == 0)
continue;
+ if (max_loop && i == max_bits)
+ {
+ loop_label = gen_label_rtx ();
+ emit_label (loop_label);
+ /* Since we may run this multiple times, don't assume we
+ know anything about the offset. */
+ clear_mem_offset (to);
+ }
+
/* Issue a store of BLKSIZE bytes. */
+ bool update_needed = i != sctz_len || loop_label;
to = store_by_pieces (to, blksize,
constfun, constfundata,
align, true,
- i != sctz_len ? RETURN_END : RETURN_BEGIN);
+ update_needed ? RETURN_END : RETURN_BEGIN);
/* Adjust REM and PTR, unless this is the last iteration. */
- if (i != sctz_len)
+ if (update_needed)
{
emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
to = replace_equiv_address (to, ptr);
@@ -4438,6 +4477,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
}
+ if (loop_label)
+ emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+ ptr_mode, 1, loop_label,
+ profile_probability::likely ());
+
if (label)
{
emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index 562d73d7f55..c28af170be8 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,10 @@ finline-atomics
Common Var(flag_inline_atomics) Init(1) Optimization
Inline __atomic operations when a lock free instruction sequence is available.
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(0) Optimization
+Inline memset even if it requires loops.
+
fcf-protection
Common RejectNegative Alias(fcf-protection=,full)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index da9ad1068fb..19f436ad463 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
-fgcse-sm -fhoist-adjacent-loads -fif-conversion @gol
-fif-conversion2 -findirect-inlining @gol
-finline-functions -finline-functions-called-once -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp -fipa-cp-clone @gol
-fipa-bit-cp -fipa-vrp -fipa-pta -fipa-profile -fipa-pure-const @gol
-fipa-reference -fipa-reference-addressable @gol
-fipa-stack-alignment -fipa-icf -fira-algorithm=@var{algorithm} @gol
@@ -11960,6 +11961,16 @@ in its own right.
Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
but not @option{-Og}.
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache. This
+option is disabled by default.
+
@item -fearly-inlining
@opindex fearly-inlining
Inline functions marked by @code{always_inline} and functions whose body seems
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..34105a42c07
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-memset-loops" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+ return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+ return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */
^ permalink raw reply [flat|nested] 4+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] inline memset loops
@ 2022-12-23 5:26 Alexandre Oliva
0 siblings, 0 replies; 4+ messages in thread
From: Alexandre Oliva @ 2022-12-23 5:26 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:8cd58c4687270408ffb342020eb26574ef06989e
commit 8cd58c4687270408ffb342020eb26574ef06989e
Author: Alexandre Oliva <oliva@gnu.org>
Date: Thu Dec 22 21:28:44 2022 -0300
inline memset loops
Diff:
---
gcc/builtins.cc | 50 +++++++++++++++++++++++--
gcc/common.opt | 4 ++
gcc/doc/invoke.texi | 13 ++++++-
gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c | 14 +++++++
4 files changed, 77 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 02c4fefa86f..388bae58ce4 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4361,9 +4361,37 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
if (max_bits >= 0)
xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
- (HOST_WIDE_INT_1U << ctz_len));
+ bool max_loop = false;
if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
&valc, align, true))
- return false;
+ {
+ if (!flag_inline_memset_loops)
+ return false;
+ while (--max_bits >= sctz_len)
+ {
+ xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+ - (HOST_WIDE_INT_1U << ctz_len));
+ if (can_store_by_pieces (xlenest + blksize,
+ builtin_memset_read_str,
+ &valc, align, true))
+ {
+ max_loop = true;
+ break;
+ }
+ if (!blksize)
+ continue;
+ if (can_store_by_pieces (xlenest,
+ builtin_memset_read_str,
+ &valc, align, true))
+ {
+ blksize = 0;
+ max_loop = true;
+ break;
+ }
+ }
+ if (!max_loop)
+ return false;
+ }
by_pieces_constfn constfun;
void *constfundata;
@@ -4405,6 +4433,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
the least significant bit possibly set in the length. */
for (int i = max_bits; i >= sctz_len; i--)
{
+ rtx_code_label *loop_label = NULL;
rtx_code_label *label = NULL;
blksize = HOST_WIDE_INT_1U << i;
@@ -4423,14 +4452,24 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
else if ((max_len & blksize) == 0)
continue;
+ if (max_loop && i == max_bits)
+ {
+ loop_label = gen_label_rtx ();
+ emit_label (loop_label);
+ /* Since we may run this multiple times, don't assume we
+ know anything about the offset. */
+ clear_mem_offset (to);
+ }
+
/* Issue a store of BLKSIZE bytes. */
+ bool update_needed = i != sctz_len || loop_label;
to = store_by_pieces (to, blksize,
constfun, constfundata,
align, true,
- i != sctz_len ? RETURN_END : RETURN_BEGIN);
+ update_needed ? RETURN_END : RETURN_BEGIN);
/* Adjust REM and PTR, unless this is the last iteration. */
- if (i != sctz_len)
+ if (update_needed)
{
emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
to = replace_equiv_address (to, ptr);
@@ -4438,6 +4477,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
}
+ if (loop_label)
+ emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+ ptr_mode, 1, loop_label,
+ profile_probability::likely ());
+
if (label)
{
emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index 562d73d7f55..c28af170be8 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,10 @@ finline-atomics
Common Var(flag_inline_atomics) Init(1) Optimization
Inline __atomic operations when a lock free instruction sequence is available.
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(0) Optimization
+Inline memset even if it requires loops.
+
fcf-protection
Common RejectNegative Alias(fcf-protection=,full)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index da9ad1068fb..19f436ad463 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
-fgcse-sm -fhoist-adjacent-loads -fif-conversion @gol
-fif-conversion2 -findirect-inlining @gol
-finline-functions -finline-functions-called-once -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp -fipa-cp-clone @gol
-fipa-bit-cp -fipa-vrp -fipa-pta -fipa-profile -fipa-pure-const @gol
-fipa-reference -fipa-reference-addressable @gol
-fipa-stack-alignment -fipa-icf -fira-algorithm=@var{algorithm} @gol
@@ -11960,6 +11961,16 @@ in its own right.
Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
but not @option{-Og}.
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache. This
+option is disabled by default.
+
@item -fearly-inlining
@opindex fearly-inlining
Inline functions marked by @code{always_inline} and functions whose body seems
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 00000000000..591c54b4fcc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-memset-loops" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+ return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+ return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */
^ permalink raw reply [flat|nested] 4+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] inline memset loops
@ 2022-12-23 0:57 Alexandre Oliva
0 siblings, 0 replies; 4+ messages in thread
From: Alexandre Oliva @ 2022-12-23 0:57 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:7d172bec5011526917596810a9fe0a140bcbe9a8
commit 7d172bec5011526917596810a9fe0a140bcbe9a8
Author: Alexandre Oliva <oliva@gnu.org>
Date: Thu Dec 22 21:28:44 2022 -0300
inline memset loops
Diff:
---
gcc/builtins.cc | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
gcc/common.opt | 5 +++++
gcc/doc/invoke.texi | 13 ++++++++++++-
3 files changed, 64 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 02c4fefa86f..388bae58ce4 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4361,9 +4361,37 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
if (max_bits >= 0)
xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
- (HOST_WIDE_INT_1U << ctz_len));
+ bool max_loop = false;
if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
&valc, align, true))
- return false;
+ {
+ if (!flag_inline_memset_loops)
+ return false;
+ while (--max_bits >= sctz_len)
+ {
+ xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+ - (HOST_WIDE_INT_1U << ctz_len));
+ if (can_store_by_pieces (xlenest + blksize,
+ builtin_memset_read_str,
+ &valc, align, true))
+ {
+ max_loop = true;
+ break;
+ }
+ if (!blksize)
+ continue;
+ if (can_store_by_pieces (xlenest,
+ builtin_memset_read_str,
+ &valc, align, true))
+ {
+ blksize = 0;
+ max_loop = true;
+ break;
+ }
+ }
+ if (!max_loop)
+ return false;
+ }
by_pieces_constfn constfun;
void *constfundata;
@@ -4405,6 +4433,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
the least significant bit possibly set in the length. */
for (int i = max_bits; i >= sctz_len; i--)
{
+ rtx_code_label *loop_label = NULL;
rtx_code_label *label = NULL;
blksize = HOST_WIDE_INT_1U << i;
@@ -4423,14 +4452,24 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
else if ((max_len & blksize) == 0)
continue;
+ if (max_loop && i == max_bits)
+ {
+ loop_label = gen_label_rtx ();
+ emit_label (loop_label);
+ /* Since we may run this multiple times, don't assume we
+ know anything about the offset. */
+ clear_mem_offset (to);
+ }
+
/* Issue a store of BLKSIZE bytes. */
+ bool update_needed = i != sctz_len || loop_label;
to = store_by_pieces (to, blksize,
constfun, constfundata,
align, true,
- i != sctz_len ? RETURN_END : RETURN_BEGIN);
+ update_needed ? RETURN_END : RETURN_BEGIN);
/* Adjust REM and PTR, unless this is the last iteration. */
- if (i != sctz_len)
+ if (update_needed)
{
emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
to = replace_equiv_address (to, ptr);
@@ -4438,6 +4477,11 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
}
+ if (loop_label)
+ emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+ ptr_mode, 1, loop_label,
+ profile_probability::likely ());
+
if (label)
{
emit_label (label);
diff --git a/gcc/common.opt b/gcc/common.opt
index 562d73d7f55..b21f4225782 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1874,6 +1874,11 @@ finline-atomics
Common Var(flag_inline_atomics) Init(1) Optimization
Inline __atomic operations when a lock free instruction sequence is available.
+; FIXME: Disable by default!
+finline-memset-loops
+Common Var(flag_inline_memset_loops) Init(1) Optimization
+Inline memset even if it requires loops.
+
fcf-protection
Common RejectNegative Alias(fcf-protection=,full)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index da9ad1068fb..19f436ad463 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,8 @@ Objective-C and Objective-C++ Dialects}.
-fgcse-sm -fhoist-adjacent-loads -fif-conversion @gol
-fif-conversion2 -findirect-inlining @gol
-finline-functions -finline-functions-called-once -finline-limit=@var{n} @gol
--finline-small-functions -fipa-modref -fipa-cp -fipa-cp-clone @gol
+-finline-memset-loops -finline-small-functions @gol
+-fipa-modref -fipa-cp -fipa-cp-clone @gol
-fipa-bit-cp -fipa-vrp -fipa-pta -fipa-profile -fipa-pure-const @gol
-fipa-reference -fipa-reference-addressable @gol
-fipa-stack-alignment -fipa-icf -fira-algorithm=@var{algorithm} @gol
@@ -11960,6 +11961,16 @@ in its own right.
Enabled at levels @option{-O1}, @option{-O2}, @option{-O3} and @option{-Os},
but not @option{-Og}.
+@item -finline-memset-loops
+@opindex finline-memset-loops
+Expand @code{memset} calls inline, even when the length is variable or
+big enough to require looping.  This may enable the compiler to take
+advantage of known alignment and length multipliers, but it will often
+generate code that is less efficient than performant implementations of
+@code{memset}, and grow code size so much that even a less performant
+@code{memset} may run faster due to better use of the code cache. This
+option is disabled by default.
+
@item -fearly-inlining
@opindex fearly-inlining
Inline functions marked by @code{always_inline} and functions whose body seems
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2022-12-23 7:38 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-23 7:38 [gcc(refs/users/aoliva/heads/testme)] inline memset loops Alexandre Oliva
-- strict thread matches above, loose matches on Subject: below --
2022-12-23 7:32 Alexandre Oliva
2022-12-23 5:26 Alexandre Oliva
2022-12-23 0:57 Alexandre Oliva
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).