public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/aoliva/heads/testme)] add memcpy and memmove loop expanders
@ 2023-01-20 23:01 Alexandre Oliva
0 siblings, 0 replies; 6+ messages in thread
From: Alexandre Oliva @ 2023-01-20 23:01 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:c0ddf0d3449743f447b825e68de70d6142301a52
commit c0ddf0d3449743f447b825e68de70d6142301a52
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 19:23:02 2023 -0300
add memcpy and memmove loop expanders
Diff:
---
gcc/builtins.cc | 2 +-
gcc/expr.cc | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
gcc/expr.h | 6 +-
3 files changed, 182 insertions(+), 18 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index b7737678a7d..ca4d4721cdc 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3756,7 +3756,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
expected_align, expected_size,
min_size, max_size, probable_max_size,
use_mempcpy_call, &is_move_done,
- might_overlap);
+ might_overlap, tree_ctz (len));
/* Bail out when a mempcpy call would be expanded as libcall and when
we have a target that provides a fast implementation
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 15be1c8db99..0bba36ee131 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,9 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
HOST_WIDE_INT, unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -1966,7 +1968,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
unsigned HOST_WIDE_INT max_size,
unsigned HOST_WIDE_INT probable_max_size,
bool bail_out_libcall, bool *is_move_done,
- bool might_overlap)
+ bool might_overlap, unsigned ctz_size)
{
int may_use_call;
rtx retval = 0;
@@ -2052,6 +2054,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
}
+ bool dynamic_direction = false;
+ if (!pattern_ok && !pieces_ok && may_use_call
+ && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+ {
+ may_use_call = 0;
+ dynamic_direction = might_overlap;
+ }
+
if (pattern_ok)
;
else if (pieces_ok)
@@ -2073,10 +2083,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
retval = emit_block_copy_via_libcall (x, y, size,
method == BLOCK_OP_TAILCALL);
}
+ else if (dynamic_direction)
+ emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
else if (might_overlap)
*is_move_done = false;
else
- emit_block_move_via_loop (x, y, size, align);
+ emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
if (method == BLOCK_OP_CALL_PARM)
OK_DEFER_POP;
@@ -2085,7 +2097,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+ unsigned int ctz_size)
{
unsigned HOST_WIDE_INT max, min = 0;
if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2106,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
else
max = GET_MODE_MASK (GET_MODE (size));
return emit_block_move_hints (x, y, size, method, 0, -1,
- min, max, max);
+ min, max, max,
+ false, NULL, false, ctz_size);
}
/* A subroutine of emit_block_move. Returns true if calling the
@@ -2255,13 +2269,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
return false;
}
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+ ALIGN and CTZ_SIZE. */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !can_move_by_pieces (incr, align))
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+ as to ensure safe moves even in case of overlap, output dynamic
+ tests to choose between two loops, one moving downwards, another
+ moving upwards. */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ rtx_code_label *upw_label, *end_label;
+ upw_label = gen_label_rtx ();
+ end_label = gen_label_rtx ();
+
+ rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ machine_mode mode = GET_MODE (x_addr);
+ if (mode != GET_MODE (y_addr))
+ {
+ scalar_int_mode xmode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+ scalar_int_mode ymode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE
+ (GET_MODE (y_addr)));
+ if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+ mode = ymode;
+ else
+ mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+ const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+ x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ }
+
+ /* Test for overlap: if (x >= y || x + size <= y) goto upw_label. */
+ emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (5, 10));
+ rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+ tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+ emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (8, 10));
+
+ emit_block_move_via_loop (x, y, size, align, -incr);
+
+ emit_jump (end_label);
+ emit_label (upw_label);
+
+ emit_block_move_via_loop (x, y, size, align, incr);
+
+ emit_label (end_label);
+}
+
/* A subroutine of emit_block_move. Copy the data via an explicit
- loop. This is used only when libcalls are forbidden. */
-/* ??? It'd be nice to copy in hunks larger than QImode. */
+ loop. This is used only when libcalls are forbidden, or when
+ inlining is required. INCR is the block size to be copied in each
+ loop iteration. If it is negative, the absolute value is used, and
+ the block is copied backwards. INCR must be a power of two, an
+ exact divisor for SIZE and ALIGN, and imply a mode that can be
+ safely copied per iteration assuming no overlap. */
static void
emit_block_move_via_loop (rtx x, rtx y, rtx size,
- unsigned int align ATTRIBUTE_UNUSED)
+ unsigned int align, int incr)
{
rtx_code_label *cmp_label, *top_label;
rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2395,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
cmp_label = gen_label_rtx ();
iter = gen_reg_rtx (iter_mode);
- emit_move_insn (iter, const0_rtx);
+ bool downwards = incr < 0;
+ rtx iter_init;
+ rtx_code iter_cond;
+ rtx iter_limit;
+ rtx iter_incr;
+ machine_mode move_mode;
+ if (downwards)
+ {
+ incr = -incr;
+ iter_init = size;
+ iter_cond = GE;
+ iter_limit = const0_rtx;
+ iter_incr = GEN_INT (incr);
+ }
+ else
+ {
+ iter_init = const0_rtx;
+ iter_cond = LT;
+ iter_limit = size;
+ iter_incr = GEN_INT (incr);
+ }
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_move_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+ {
+ move_mode = BLKmode;
+ gcc_checking_assert (can_move_by_pieces (incr, align));
+ }
+ else
+ move_mode = int_move_mode;
x_addr = force_operand (XEXP (x, 0), NULL_RTX);
y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2442,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
- x = change_address (x, QImode, x_addr);
- y = change_address (y, QImode, y_addr);
+ x = change_address (x, move_mode, x_addr);
+ y = change_address (y, move_mode, y_addr);
- emit_move_insn (x, y);
+ if (move_mode == BLKmode)
+ {
+ bool done;
+ emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+ align, incr, incr, incr, incr,
+ false, &done, false);
+ gcc_checking_assert (done);
+ }
+ else
+ emit_move_insn (x, y);
- tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+ if (downwards)
+ emit_label (cmp_label);
+
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
true, OPTAB_LIB_WIDEN);
if (tmp != iter)
emit_move_insn (iter, tmp);
- emit_label (cmp_label);
+ if (!downwards)
+ emit_label (cmp_label);
- emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
true, top_label,
profile_probability::guessed_always ()
.apply_scale (9, 10));
diff --git a/gcc/expr.h b/gcc/expr.h
index e3ba9eb5370..d9fc47c9114 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
fixed_size_mode mode;
};
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+ unsigned ctz_size = 0);
extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned int, HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
@@ -134,7 +135,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned HOST_WIDE_INT,
bool bail_out_libcall = false,
bool *is_move_done = NULL,
- bool might_overlap = false);
+ bool might_overlap = false,
+ unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
by_pieces_constfn, void *);
extern bool emit_storent_insn (rtx to, rtx from);
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcpy and memmove loop expanders
@ 2023-01-27 5:58 Alexandre Oliva
0 siblings, 0 replies; 6+ messages in thread
From: Alexandre Oliva @ 2023-01-27 5:58 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:5fd339790dbc19ed77f8ef2354472647e9235dd8
commit 5fd339790dbc19ed77f8ef2354472647e9235dd8
Author: Alexandre Oliva <oliva@gnu.org>
Date: Thu Jan 26 22:52:18 2023 -0300
add memcpy and memmove loop expanders
Diff:
---
gcc/builtins.cc | 2 +-
gcc/expr.cc | 194 +++++++++++++++++++++--
gcc/expr.h | 6 +-
gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c | 8 +
gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c | 7 +
5 files changed, 199 insertions(+), 18 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 894ee99fd9a..a9f92e9eb29 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3756,7 +3756,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
expected_align, expected_size,
min_size, max_size, probable_max_size,
use_mempcpy_call, &is_move_done,
- might_overlap);
+ might_overlap, tree_ctz (len));
/* Bail out when a mempcpy call would be expanded as libcall and when
we have a target that provides a fast implementation
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 15be1c8db99..aceb3f514fc 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,9 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
HOST_WIDE_INT, unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -1955,6 +1957,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
MIN_SIZE is the minimal size of block to move
MAX_SIZE is the maximal size of block to move, if it cannot be represented
in unsigned HOST_WIDE_INT, than it is mask of all ones.
+ CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+ known to be a multiple of 1<<CTZ_SIZE.
Return the address of the new block, if memcpy is called and returns it,
0 otherwise. */
@@ -1966,7 +1970,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
unsigned HOST_WIDE_INT max_size,
unsigned HOST_WIDE_INT probable_max_size,
bool bail_out_libcall, bool *is_move_done,
- bool might_overlap)
+ bool might_overlap, unsigned ctz_size)
{
int may_use_call;
rtx retval = 0;
@@ -2052,6 +2056,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
}
+ bool dynamic_direction = false;
+ if (!pattern_ok && !pieces_ok && may_use_call
+ && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+ {
+ may_use_call = 0;
+ dynamic_direction = might_overlap;
+ }
+
if (pattern_ok)
;
else if (pieces_ok)
@@ -2073,10 +2085,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
retval = emit_block_copy_via_libcall (x, y, size,
method == BLOCK_OP_TAILCALL);
}
+ else if (dynamic_direction)
+ emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
else if (might_overlap)
*is_move_done = false;
else
- emit_block_move_via_loop (x, y, size, align);
+ emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
if (method == BLOCK_OP_CALL_PARM)
OK_DEFER_POP;
@@ -2085,7 +2099,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+ unsigned int ctz_size)
{
unsigned HOST_WIDE_INT max, min = 0;
if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2108,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
else
max = GET_MODE_MASK (GET_MODE (size));
return emit_block_move_hints (x, y, size, method, 0, -1,
- min, max, max);
+ min, max, max,
+ false, NULL, false, ctz_size);
}
/* A subroutine of emit_block_move. Returns true if calling the
@@ -2255,13 +2271,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
return false;
}
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+ ALIGN and CTZ_SIZE. */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !can_move_by_pieces (incr, align))
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+ as to ensure safe moves even in case of overlap, output dynamic
+ tests to choose between two loops, one moving downwards, another
+ moving upwards. */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ rtx_code_label *upw_label, *end_label;
+ upw_label = gen_label_rtx ();
+ end_label = gen_label_rtx ();
+
+ rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ machine_mode mode = GET_MODE (x_addr);
+ if (mode != GET_MODE (y_addr))
+ {
+ scalar_int_mode xmode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+ scalar_int_mode ymode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE
+ (GET_MODE (y_addr)));
+ if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+ mode = ymode;
+ else
+ mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+ const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+ x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ }
+
+ /* Test for overlap: if (x >= y || x + size <= y) goto upw_label. */
+ emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (5, 10));
+ rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+ tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+ emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (8, 10));
+
+ emit_block_move_via_loop (x, y, size, align, -incr);
+
+ emit_jump (end_label);
+ emit_label (upw_label);
+
+ emit_block_move_via_loop (x, y, size, align, incr);
+
+ emit_label (end_label);
+}
+
/* A subroutine of emit_block_move. Copy the data via an explicit
- loop. This is used only when libcalls are forbidden. */
-/* ??? It'd be nice to copy in hunks larger than QImode. */
+ loop. This is used only when libcalls are forbidden, or when
+ inlining is required. INCR is the block size to be copied in each
+ loop iteration. If it is negative, the absolute value is used, and
+ the block is copied backwards. INCR must be a power of two, an
+ exact divisor for SIZE and ALIGN, and imply a mode that can be
+ safely copied per iteration assuming no overlap. */
static void
emit_block_move_via_loop (rtx x, rtx y, rtx size,
- unsigned int align ATTRIBUTE_UNUSED)
+ unsigned int align, int incr)
{
rtx_code_label *cmp_label, *top_label;
rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2397,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
cmp_label = gen_label_rtx ();
iter = gen_reg_rtx (iter_mode);
- emit_move_insn (iter, const0_rtx);
+ bool downwards = incr < 0;
+ rtx iter_init;
+ rtx_code iter_cond;
+ rtx iter_limit;
+ rtx iter_incr;
+ machine_mode move_mode;
+ if (downwards)
+ {
+ incr = -incr;
+ iter_init = size;
+ iter_cond = GE;
+ iter_limit = const0_rtx;
+ iter_incr = GEN_INT (incr);
+ }
+ else
+ {
+ iter_init = const0_rtx;
+ iter_cond = LT;
+ iter_limit = size;
+ iter_incr = GEN_INT (incr);
+ }
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_move_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+ {
+ move_mode = BLKmode;
+ gcc_checking_assert (can_move_by_pieces (incr, align));
+ }
+ else
+ move_mode = int_move_mode;
x_addr = force_operand (XEXP (x, 0), NULL_RTX);
y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2444,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
- x = change_address (x, QImode, x_addr);
- y = change_address (y, QImode, y_addr);
+ x = change_address (x, move_mode, x_addr);
+ y = change_address (y, move_mode, y_addr);
+
+ if (move_mode == BLKmode)
+ {
+ bool done;
+ emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+ align, incr, incr, incr, incr,
+ false, &done, false);
+ gcc_checking_assert (done);
+ }
+ else
+ emit_move_insn (x, y);
- emit_move_insn (x, y);
+ if (downwards)
+ emit_label (cmp_label);
- tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
true, OPTAB_LIB_WIDEN);
if (tmp != iter)
emit_move_insn (iter, tmp);
- emit_label (cmp_label);
+ if (!downwards)
+ emit_label (cmp_label);
- emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
true, top_label,
profile_probability::guessed_always ()
.apply_scale (9, 10));
diff --git a/gcc/expr.h b/gcc/expr.h
index e3ba9eb5370..d9fc47c9114 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
fixed_size_mode mode;
};
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+ unsigned ctz_size = 0);
extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned int, HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
@@ -134,7 +135,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned HOST_WIDE_INT,
bool bail_out_libcall = false,
bool *is_move_done = NULL,
- bool might_overlap = false);
+ bool might_overlap = false,
+ unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
by_pieces_constfn, void *);
extern bool emit_storent_insn (rtx to, rtx from);
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..2a7d74fbee4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcpy -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+ memcpy tests. */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..5e38bc99ce1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memmove" } } */
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcpy and memmove loop expanders
@ 2023-01-27 2:25 Alexandre Oliva
0 siblings, 0 replies; 6+ messages in thread
From: Alexandre Oliva @ 2023-01-27 2:25 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:6517957459debe07872931b426376952319d5cdf
commit 6517957459debe07872931b426376952319d5cdf
Author: Alexandre Oliva <oliva@gnu.org>
Date: Thu Jan 26 22:52:18 2023 -0300
add memcpy and memmove loop expanders
Diff:
---
gcc/builtins.cc | 2 +-
gcc/expr.cc | 194 +++++++++++++++++++++--
gcc/expr.h | 6 +-
gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c | 8 +
gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c | 7 +
5 files changed, 199 insertions(+), 18 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index bf21fef699f..4bf404a8a78 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3756,7 +3756,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
expected_align, expected_size,
min_size, max_size, probable_max_size,
use_mempcpy_call, &is_move_done,
- might_overlap);
+ might_overlap, tree_ctz (len));
/* Bail out when a mempcpy call would be expanded as libcall and when
we have a target that provides a fast implementation
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 15be1c8db99..aceb3f514fc 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,9 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
HOST_WIDE_INT, unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -1955,6 +1957,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
MIN_SIZE is the minimal size of block to move
MAX_SIZE is the maximal size of block to move, if it cannot be represented
in unsigned HOST_WIDE_INT, than it is mask of all ones.
+ CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+ known to be a multiple of 1<<CTZ_SIZE.
Return the address of the new block, if memcpy is called and returns it,
0 otherwise. */
@@ -1966,7 +1970,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
unsigned HOST_WIDE_INT max_size,
unsigned HOST_WIDE_INT probable_max_size,
bool bail_out_libcall, bool *is_move_done,
- bool might_overlap)
+ bool might_overlap, unsigned ctz_size)
{
int may_use_call;
rtx retval = 0;
@@ -2052,6 +2056,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
}
+ bool dynamic_direction = false;
+ if (!pattern_ok && !pieces_ok && may_use_call
+ && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+ {
+ may_use_call = 0;
+ dynamic_direction = might_overlap;
+ }
+
if (pattern_ok)
;
else if (pieces_ok)
@@ -2073,10 +2085,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
retval = emit_block_copy_via_libcall (x, y, size,
method == BLOCK_OP_TAILCALL);
}
+ else if (dynamic_direction)
+ emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
else if (might_overlap)
*is_move_done = false;
else
- emit_block_move_via_loop (x, y, size, align);
+ emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
if (method == BLOCK_OP_CALL_PARM)
OK_DEFER_POP;
@@ -2085,7 +2099,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+ unsigned int ctz_size)
{
unsigned HOST_WIDE_INT max, min = 0;
if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2108,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
else
max = GET_MODE_MASK (GET_MODE (size));
return emit_block_move_hints (x, y, size, method, 0, -1,
- min, max, max);
+ min, max, max,
+ false, NULL, false, ctz_size);
}
/* A subroutine of emit_block_move. Returns true if calling the
@@ -2255,13 +2271,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
return false;
}
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+ ALIGN and CTZ_SIZE. */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !can_move_by_pieces (incr, align))
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+ as to ensure safe moves even in case of overlap, output dynamic
+ tests to choose between two loops, one moving downwards, another
+ moving upwards. */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ rtx_code_label *upw_label, *end_label;
+ upw_label = gen_label_rtx ();
+ end_label = gen_label_rtx ();
+
+ rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ machine_mode mode = GET_MODE (x_addr);
+ if (mode != GET_MODE (y_addr))
+ {
+ scalar_int_mode xmode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+ scalar_int_mode ymode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE
+ (GET_MODE (y_addr)));
+ if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+ mode = ymode;
+ else
+ mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+ const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+ x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ }
+
+ /* Test for overlap: if (x >= y || x + size <= y) goto upw_label. */
+ emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (5, 10));
+ rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+ tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+ emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (8, 10));
+
+ emit_block_move_via_loop (x, y, size, align, -incr);
+
+ emit_jump (end_label);
+ emit_label (upw_label);
+
+ emit_block_move_via_loop (x, y, size, align, incr);
+
+ emit_label (end_label);
+}
+
/* A subroutine of emit_block_move. Copy the data via an explicit
- loop. This is used only when libcalls are forbidden. */
-/* ??? It'd be nice to copy in hunks larger than QImode. */
+ loop. This is used only when libcalls are forbidden, or when
+ inlining is required. INCR is the block size to be copied in each
+ loop iteration. If it is negative, the absolute value is used, and
+ the block is copied backwards. INCR must be a power of two, an
+ exact divisor for SIZE and ALIGN, and imply a mode that can be
+ safely copied per iteration assuming no overlap. */
static void
emit_block_move_via_loop (rtx x, rtx y, rtx size,
- unsigned int align ATTRIBUTE_UNUSED)
+ unsigned int align, int incr)
{
rtx_code_label *cmp_label, *top_label;
rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2397,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
cmp_label = gen_label_rtx ();
iter = gen_reg_rtx (iter_mode);
- emit_move_insn (iter, const0_rtx);
+ bool downwards = incr < 0;
+ rtx iter_init;
+ rtx_code iter_cond;
+ rtx iter_limit;
+ rtx iter_incr;
+ machine_mode move_mode;
+ if (downwards)
+ {
+ incr = -incr;
+ iter_init = size;
+ iter_cond = GE;
+ iter_limit = const0_rtx;
+ iter_incr = GEN_INT (incr);
+ }
+ else
+ {
+ iter_init = const0_rtx;
+ iter_cond = LT;
+ iter_limit = size;
+ iter_incr = GEN_INT (incr);
+ }
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_move_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+ {
+ move_mode = BLKmode;
+ gcc_checking_assert (can_move_by_pieces (incr, align));
+ }
+ else
+ move_mode = int_move_mode;
x_addr = force_operand (XEXP (x, 0), NULL_RTX);
y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2444,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
- x = change_address (x, QImode, x_addr);
- y = change_address (y, QImode, y_addr);
+ x = change_address (x, move_mode, x_addr);
+ y = change_address (y, move_mode, y_addr);
+
+ if (move_mode == BLKmode)
+ {
+ bool done;
+ emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+ align, incr, incr, incr, incr,
+ false, &done, false);
+ gcc_checking_assert (done);
+ }
+ else
+ emit_move_insn (x, y);
- emit_move_insn (x, y);
+ if (downwards)
+ emit_label (cmp_label);
- tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
true, OPTAB_LIB_WIDEN);
if (tmp != iter)
emit_move_insn (iter, tmp);
- emit_label (cmp_label);
+ if (!downwards)
+ emit_label (cmp_label);
- emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
true, top_label,
profile_probability::guessed_always ()
.apply_scale (9, 10));
diff --git a/gcc/expr.h b/gcc/expr.h
index e3ba9eb5370..d9fc47c9114 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
fixed_size_mode mode;
};
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+ unsigned ctz_size = 0);
extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned int, HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
@@ -134,7 +135,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned HOST_WIDE_INT,
bool bail_out_libcall = false,
bool *is_move_done = NULL,
- bool might_overlap = false);
+ bool might_overlap = false,
+ unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
by_pieces_constfn, void *);
extern bool emit_storent_insn (rtx to, rtx from);
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..2a7d74fbee4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcpy -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+ memcpy tests. */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..5e38bc99ce1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memmove" } } */
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcpy and memmove loop expanders
@ 2023-01-27 1:57 Alexandre Oliva
0 siblings, 0 replies; 6+ messages in thread
From: Alexandre Oliva @ 2023-01-27 1:57 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:9aeb96e942930ee9aecce6579e403670660a3100
commit 9aeb96e942930ee9aecce6579e403670660a3100
Author: Alexandre Oliva <oliva@gnu.org>
Date: Thu Jan 26 22:52:18 2023 -0300
add memcpy and memmove loop expanders
Diff:
---
gcc/builtins.cc | 2 +-
gcc/expr.cc | 194 +++++++++++++++++++++--
gcc/expr.h | 6 +-
gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c | 8 +
gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c | 7 +
5 files changed, 199 insertions(+), 18 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index b7737678a7d..ca4d4721cdc 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3756,7 +3756,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
expected_align, expected_size,
min_size, max_size, probable_max_size,
use_mempcpy_call, &is_move_done,
- might_overlap);
+ might_overlap, tree_ctz (len));
/* Bail out when a mempcpy call would be expanded as libcall and when
we have a target that provides a fast implementation
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 15be1c8db99..aceb3f514fc 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,9 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
HOST_WIDE_INT, unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -1955,6 +1957,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
MIN_SIZE is the minimal size of block to move
MAX_SIZE is the maximal size of block to move, if it cannot be represented
in unsigned HOST_WIDE_INT, then it is a mask of all ones.
+ CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+ known to be a multiple of 1<<CTZ_SIZE.
Return the address of the new block, if memcpy is called and returns it,
0 otherwise. */
@@ -1966,7 +1970,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
unsigned HOST_WIDE_INT max_size,
unsigned HOST_WIDE_INT probable_max_size,
bool bail_out_libcall, bool *is_move_done,
- bool might_overlap)
+ bool might_overlap, unsigned ctz_size)
{
int may_use_call;
rtx retval = 0;
@@ -2052,6 +2056,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
}
+ bool dynamic_direction = false;
+ if (!pattern_ok && !pieces_ok && may_use_call
+ && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+ {
+ may_use_call = 0;
+ dynamic_direction = might_overlap;
+ }
+
if (pattern_ok)
;
else if (pieces_ok)
@@ -2073,10 +2085,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
retval = emit_block_copy_via_libcall (x, y, size,
method == BLOCK_OP_TAILCALL);
}
+ else if (dynamic_direction)
+ emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
else if (might_overlap)
*is_move_done = false;
else
- emit_block_move_via_loop (x, y, size, align);
+ emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
if (method == BLOCK_OP_CALL_PARM)
OK_DEFER_POP;
@@ -2085,7 +2099,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+ unsigned int ctz_size)
{
unsigned HOST_WIDE_INT max, min = 0;
if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2108,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
else
max = GET_MODE_MASK (GET_MODE (size));
return emit_block_move_hints (x, y, size, method, 0, -1,
- min, max, max);
+ min, max, max,
+ false, NULL, false, ctz_size);
}
/* A subroutine of emit_block_move. Returns true if calling the
@@ -2255,13 +2271,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
return false;
}
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+ ALIGN and CTZ_SIZE. */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !can_move_by_pieces (incr, align))
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+ as to ensure safe moves even in case of overlap, output dynamic
+ tests to choose between two loops, one moving downwards, another
+ moving upwards. */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ rtx_code_label *upw_label, *end_label;
+ upw_label = gen_label_rtx ();
+ end_label = gen_label_rtx ();
+
+ rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ machine_mode mode = GET_MODE (x_addr);
+ if (mode != GET_MODE (y_addr))
+ {
+ scalar_int_mode xmode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+ scalar_int_mode ymode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE
+ (GET_MODE (y_addr)));
+ if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+ mode = ymode;
+ else
+ mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+ const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+ x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ }
+
+ /* Test for overlap: if (x >= y || x + size <= y) goto upw_label. */
+ emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (5, 10));
+ rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+ tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+ emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (8, 10));
+
+ emit_block_move_via_loop (x, y, size, align, -incr);
+
+ emit_jump (end_label);
+ emit_label (upw_label);
+
+ emit_block_move_via_loop (x, y, size, align, incr);
+
+ emit_label (end_label);
+}
+
/* A subroutine of emit_block_move. Copy the data via an explicit
- loop. This is used only when libcalls are forbidden. */
-/* ??? It'd be nice to copy in hunks larger than QImode. */
+ loop. This is used only when libcalls are forbidden, or when
+ inlining is required. INCR is the block size to be copied in each
+ loop iteration. If it is negative, the absolute value is used, and
+ the block is copied backwards. INCR must be a power of two, an
+ exact divisor for SIZE and ALIGN, and imply a mode that can be
+ safely copied per iteration assuming no overlap. */
static void
emit_block_move_via_loop (rtx x, rtx y, rtx size,
- unsigned int align ATTRIBUTE_UNUSED)
+ unsigned int align, int incr)
{
rtx_code_label *cmp_label, *top_label;
rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2397,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
cmp_label = gen_label_rtx ();
iter = gen_reg_rtx (iter_mode);
- emit_move_insn (iter, const0_rtx);
+ bool downwards = incr < 0;
+ rtx iter_init;
+ rtx_code iter_cond;
+ rtx iter_limit;
+ rtx iter_incr;
+ machine_mode move_mode;
+ if (downwards)
+ {
+ incr = -incr;
+ iter_init = size;
+ iter_cond = GE;
+ iter_limit = const0_rtx;
+ iter_incr = GEN_INT (incr);
+ }
+ else
+ {
+ iter_init = const0_rtx;
+ iter_cond = LT;
+ iter_limit = size;
+ iter_incr = GEN_INT (incr);
+ }
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_move_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+ {
+ move_mode = BLKmode;
+ gcc_checking_assert (can_move_by_pieces (incr, align));
+ }
+ else
+ move_mode = int_move_mode;
x_addr = force_operand (XEXP (x, 0), NULL_RTX);
y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2444,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
- x = change_address (x, QImode, x_addr);
- y = change_address (y, QImode, y_addr);
+ x = change_address (x, move_mode, x_addr);
+ y = change_address (y, move_mode, y_addr);
+
+ if (move_mode == BLKmode)
+ {
+ bool done;
+ emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+ align, incr, incr, incr, incr,
+ false, &done, false);
+ gcc_checking_assert (done);
+ }
+ else
+ emit_move_insn (x, y);
- emit_move_insn (x, y);
+ if (downwards)
+ emit_label (cmp_label);
- tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
true, OPTAB_LIB_WIDEN);
if (tmp != iter)
emit_move_insn (iter, tmp);
- emit_label (cmp_label);
+ if (!downwards)
+ emit_label (cmp_label);
- emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
true, top_label,
profile_probability::guessed_always ()
.apply_scale (9, 10));
diff --git a/gcc/expr.h b/gcc/expr.h
index e3ba9eb5370..d9fc47c9114 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
fixed_size_mode mode;
};
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+ unsigned ctz_size = 0);
extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned int, HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
@@ -134,7 +135,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned HOST_WIDE_INT,
bool bail_out_libcall = false,
bool *is_move_done = NULL,
- bool might_overlap = false);
+ bool might_overlap = false,
+ unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
by_pieces_constfn, void *);
extern bool emit_storent_insn (rtx to, rtx from);
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 00000000000..2a7d74fbee4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcpy -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+ memcpy tests. */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 00000000000..5e38bc99ce1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memmove" } } */
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcpy and memmove loop expanders
@ 2023-01-26 6:09 Alexandre Oliva
0 siblings, 0 replies; 6+ messages in thread
From: Alexandre Oliva @ 2023-01-26 6:09 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:cd610334640ffd43dfd9313f8405e139b6a973fa
commit cd610334640ffd43dfd9313f8405e139b6a973fa
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 19:23:02 2023 -0300
add memcpy and memmove loop expanders
Diff:
---
gcc/builtins.cc | 2 +-
gcc/expr.cc | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
gcc/expr.h | 6 +-
3 files changed, 184 insertions(+), 18 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index b7737678a7d..ca4d4721cdc 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3756,7 +3756,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
expected_align, expected_size,
min_size, max_size, probable_max_size,
use_mempcpy_call, &is_move_done,
- might_overlap);
+ might_overlap, tree_ctz (len));
/* Bail out when a mempcpy call would be expanded as libcall and when
we have a target that provides a fast implementation
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 15be1c8db99..aceb3f514fc 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,9 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
HOST_WIDE_INT, unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -1955,6 +1957,8 @@ compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
MIN_SIZE is the minimal size of block to move
MAX_SIZE is the maximal size of block to move, if it cannot be represented
in unsigned HOST_WIDE_INT, then it is a mask of all ones.
+ CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+ known to be a multiple of 1<<CTZ_SIZE.
Return the address of the new block, if memcpy is called and returns it,
0 otherwise. */
@@ -1966,7 +1970,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
unsigned HOST_WIDE_INT max_size,
unsigned HOST_WIDE_INT probable_max_size,
bool bail_out_libcall, bool *is_move_done,
- bool might_overlap)
+ bool might_overlap, unsigned ctz_size)
{
int may_use_call;
rtx retval = 0;
@@ -2052,6 +2056,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
}
+ bool dynamic_direction = false;
+ if (!pattern_ok && !pieces_ok && may_use_call
+ && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+ {
+ may_use_call = 0;
+ dynamic_direction = might_overlap;
+ }
+
if (pattern_ok)
;
else if (pieces_ok)
@@ -2073,10 +2085,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
retval = emit_block_copy_via_libcall (x, y, size,
method == BLOCK_OP_TAILCALL);
}
+ else if (dynamic_direction)
+ emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
else if (might_overlap)
*is_move_done = false;
else
- emit_block_move_via_loop (x, y, size, align);
+ emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
if (method == BLOCK_OP_CALL_PARM)
OK_DEFER_POP;
@@ -2085,7 +2099,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+ unsigned int ctz_size)
{
unsigned HOST_WIDE_INT max, min = 0;
if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2108,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
else
max = GET_MODE_MASK (GET_MODE (size));
return emit_block_move_hints (x, y, size, method, 0, -1,
- min, max, max);
+ min, max, max,
+ false, NULL, false, ctz_size);
}
/* A subroutine of emit_block_move. Returns true if calling the
@@ -2255,13 +2271,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
return false;
}
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+ ALIGN and CTZ_SIZE. */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !can_move_by_pieces (incr, align))
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+ as to ensure safe moves even in case of overlap, output dynamic
+ tests to choose between two loops, one moving downwards, another
+ moving upwards. */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ rtx_code_label *upw_label, *end_label;
+ upw_label = gen_label_rtx ();
+ end_label = gen_label_rtx ();
+
+ rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ machine_mode mode = GET_MODE (x_addr);
+ if (mode != GET_MODE (y_addr))
+ {
+ scalar_int_mode xmode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+ scalar_int_mode ymode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE
+ (GET_MODE (y_addr)));
+ if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+ mode = ymode;
+ else
+ mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+ const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+ x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ }
+
+ /* Test for overlap: if (x >= y || x + size <= y) goto upw_label. */
+ emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (5, 10));
+ rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+ tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+ emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (8, 10));
+
+ emit_block_move_via_loop (x, y, size, align, -incr);
+
+ emit_jump (end_label);
+ emit_label (upw_label);
+
+ emit_block_move_via_loop (x, y, size, align, incr);
+
+ emit_label (end_label);
+}
+
/* A subroutine of emit_block_move. Copy the data via an explicit
- loop. This is used only when libcalls are forbidden. */
-/* ??? It'd be nice to copy in hunks larger than QImode. */
+ loop. This is used only when libcalls are forbidden, or when
+ inlining is required. INCR is the block size to be copied in each
+ loop iteration. If it is negative, the absolute value is used, and
+ the block is copied backwards. INCR must be a power of two, an
+ exact divisor for SIZE and ALIGN, and imply a mode that can be
+ safely copied per iteration assuming no overlap. */
static void
emit_block_move_via_loop (rtx x, rtx y, rtx size,
- unsigned int align ATTRIBUTE_UNUSED)
+ unsigned int align, int incr)
{
rtx_code_label *cmp_label, *top_label;
rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2397,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
cmp_label = gen_label_rtx ();
iter = gen_reg_rtx (iter_mode);
- emit_move_insn (iter, const0_rtx);
+ bool downwards = incr < 0;
+ rtx iter_init;
+ rtx_code iter_cond;
+ rtx iter_limit;
+ rtx iter_incr;
+ machine_mode move_mode;
+ if (downwards)
+ {
+ incr = -incr;
+ iter_init = size;
+ iter_cond = GE;
+ iter_limit = const0_rtx;
+ iter_incr = GEN_INT (incr);
+ }
+ else
+ {
+ iter_init = const0_rtx;
+ iter_cond = LT;
+ iter_limit = size;
+ iter_incr = GEN_INT (incr);
+ }
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_move_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+ {
+ move_mode = BLKmode;
+ gcc_checking_assert (can_move_by_pieces (incr, align));
+ }
+ else
+ move_mode = int_move_mode;
x_addr = force_operand (XEXP (x, 0), NULL_RTX);
y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2444,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
- x = change_address (x, QImode, x_addr);
- y = change_address (y, QImode, y_addr);
+ x = change_address (x, move_mode, x_addr);
+ y = change_address (y, move_mode, y_addr);
+
+ if (move_mode == BLKmode)
+ {
+ bool done;
+ emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+ align, incr, incr, incr, incr,
+ false, &done, false);
+ gcc_checking_assert (done);
+ }
+ else
+ emit_move_insn (x, y);
- emit_move_insn (x, y);
+ if (downwards)
+ emit_label (cmp_label);
- tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
true, OPTAB_LIB_WIDEN);
if (tmp != iter)
emit_move_insn (iter, tmp);
- emit_label (cmp_label);
+ if (!downwards)
+ emit_label (cmp_label);
- emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
true, top_label,
profile_probability::guessed_always ()
.apply_scale (9, 10));
diff --git a/gcc/expr.h b/gcc/expr.h
index e3ba9eb5370..d9fc47c9114 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
fixed_size_mode mode;
};
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+ unsigned ctz_size = 0);
extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned int, HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
@@ -134,7 +135,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned HOST_WIDE_INT,
bool bail_out_libcall = false,
bool *is_move_done = NULL,
- bool might_overlap = false);
+ bool might_overlap = false,
+ unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
by_pieces_constfn, void *);
extern bool emit_storent_insn (rtx to, rtx from);
^ permalink raw reply [flat|nested] 6+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcpy and memmove loop expanders
@ 2023-01-21 1:06 Alexandre Oliva
0 siblings, 0 replies; 6+ messages in thread
From: Alexandre Oliva @ 2023-01-21 1:06 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:284bc638524a3055a189e84fdc58e4858af24feb
commit 284bc638524a3055a189e84fdc58e4858af24feb
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 19:23:02 2023 -0300
add memcpy and memmove loop expanders
Diff:
---
gcc/builtins.cc | 2 +-
gcc/expr.cc | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
gcc/expr.h | 6 +-
3 files changed, 182 insertions(+), 18 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index b7737678a7d..ca4d4721cdc 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3756,7 +3756,7 @@ expand_builtin_memory_copy_args (tree dest, tree src, tree len,
expected_align, expected_size,
min_size, max_size, probable_max_size,
use_mempcpy_call, &is_move_done,
- might_overlap);
+ might_overlap, tree_ctz (len));
/* Bail out when a mempcpy call would be expanded as libcall and when
we have a target that provides a fast implementation
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 15be1c8db99..a02b0b6ed52 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,9 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
HOST_WIDE_INT, unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -1966,7 +1968,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
unsigned HOST_WIDE_INT max_size,
unsigned HOST_WIDE_INT probable_max_size,
bool bail_out_libcall, bool *is_move_done,
- bool might_overlap)
+ bool might_overlap, unsigned ctz_size)
{
int may_use_call;
rtx retval = 0;
@@ -2052,6 +2054,14 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
}
+ bool dynamic_direction = false;
+ if (!pattern_ok && !pieces_ok && may_use_call
+ && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+ {
+ may_use_call = 0;
+ dynamic_direction = might_overlap;
+ }
+
if (pattern_ok)
;
else if (pieces_ok)
@@ -2073,10 +2083,12 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
retval = emit_block_copy_via_libcall (x, y, size,
method == BLOCK_OP_TAILCALL);
}
+ else if (dynamic_direction)
+ emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
else if (might_overlap)
*is_move_done = false;
else
- emit_block_move_via_loop (x, y, size, align);
+ emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
if (method == BLOCK_OP_CALL_PARM)
OK_DEFER_POP;
@@ -2085,7 +2097,8 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
}
rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+ unsigned int ctz_size)
{
unsigned HOST_WIDE_INT max, min = 0;
if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2106,8 @@ emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
else
max = GET_MODE_MASK (GET_MODE (size));
return emit_block_move_hints (x, y, size, method, 0, -1,
- min, max, max);
+ min, max, max,
+ false, NULL, false, ctz_size);
}
/* A subroutine of emit_block_move. Returns true if calling the
@@ -2255,13 +2269,117 @@ emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
return false;
}
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+ ALIGN and CTZ_SIZE. */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !can_move_by_pieces (incr, align))
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+ as to ensure safe moves even in case of overlap, output dynamic
+ tests to choose between two loops, one moving downwards, another
+ moving upwards. */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+ unsigned int align,
+ unsigned int ctz_size)
+{
+ int incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (size))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+ incr >>= 1;
+
+ gcc_checking_assert (incr);
+
+ rtx_code_label *upw_label, *end_label;
+ upw_label = gen_label_rtx ();
+ end_label = gen_label_rtx ();
+
+ rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ machine_mode mode = GET_MODE (x_addr);
+ if (mode != GET_MODE (y_addr))
+ {
+ scalar_int_mode xmode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+ scalar_int_mode ymode
+ = smallest_int_mode_for_size (GET_MODE_BITSIZE
+ (GET_MODE (y_addr)));
+ if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+ mode = ymode;
+ else
+ mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+ const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+ x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+ POINTERS_EXTEND_UNSIGNED);
+ }
+
+ /* Test for overlap: if (x >= y || x + size <= y) goto upw_label. */
+ emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (5, 10));
+ rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+ tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+ emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+ true, upw_label,
+ profile_probability::guessed_always ()
+ .apply_scale (8, 10));
+
+ emit_block_move_via_loop (x, y, size, align, -incr);
+
+ emit_jump (end_label);
+ emit_label (upw_label);
+
+ emit_block_move_via_loop (x, y, size, align, incr);
+
+ emit_label (end_label);
+}
+
/* A subroutine of emit_block_move. Copy the data via an explicit
- loop. This is used only when libcalls are forbidden. */
-/* ??? It'd be nice to copy in hunks larger than QImode. */
+ loop. This is used only when libcalls are forbidden, or when
+ inlining is required. INCR is the block size to be copied in each
+ loop iteration. If it is negative, the absolute value is used, and
+ the block is copied backwards. INCR must be a power of two, an
+ exact divisor for SIZE and ALIGN, and imply a mode that can be
+ safely copied per iteration assuming no overlap. */
static void
emit_block_move_via_loop (rtx x, rtx y, rtx size,
- unsigned int align ATTRIBUTE_UNUSED)
+ unsigned int align, int incr)
{
rtx_code_label *cmp_label, *top_label;
rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2395,38 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
cmp_label = gen_label_rtx ();
iter = gen_reg_rtx (iter_mode);
- emit_move_insn (iter, const0_rtx);
+ bool downwards = incr < 0;
+ rtx iter_init;
+ rtx_code iter_cond;
+ rtx iter_limit;
+ rtx iter_incr;
+ machine_mode move_mode;
+ if (downwards)
+ {
+ incr = -incr;
+ iter_init = size;
+ iter_cond = GE;
+ iter_limit = const0_rtx;
+ iter_incr = GEN_INT (incr);
+ }
+ else
+ {
+ iter_init = const0_rtx;
+ iter_cond = LT;
+ iter_limit = size;
+ iter_incr = GEN_INT (incr);
+ }
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_move_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+ {
+ move_mode = BLKmode;
+ gcc_checking_assert (can_move_by_pieces (incr, align));
+ }
+ else
+ move_mode = int_move_mode;
x_addr = force_operand (XEXP (x, 0), NULL_RTX);
y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2442,32 @@ emit_block_move_via_loop (rtx x, rtx y, rtx size,
tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
- x = change_address (x, QImode, x_addr);
- y = change_address (y, QImode, y_addr);
+ x = change_address (x, move_mode, x_addr);
+ y = change_address (y, move_mode, y_addr);
+
+ if (move_mode == BLKmode)
+ {
+ bool done;
+ emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+ align, incr, incr, incr, incr,
+ false, &done, false);
+ gcc_checking_assert (done);
+ }
+ else
+ emit_move_insn (x, y);
- emit_move_insn (x, y);
+ if (downwards)
+ emit_label (cmp_label);
- tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
true, OPTAB_LIB_WIDEN);
if (tmp != iter)
emit_move_insn (iter, tmp);
- emit_label (cmp_label);
+ if (!downwards)
+ emit_label (cmp_label);
- emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
true, top_label,
profile_probability::guessed_always ()
.apply_scale (9, 10));
diff --git a/gcc/expr.h b/gcc/expr.h
index e3ba9eb5370..d9fc47c9114 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@ struct by_pieces_prev
fixed_size_mode mode;
};
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+ unsigned ctz_size = 0);
extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned int, HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
@@ -134,7 +135,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
unsigned HOST_WIDE_INT,
bool bail_out_libcall = false,
bool *is_move_done = NULL,
- bool might_overlap = false);
+ bool might_overlap = false,
+ unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
by_pieces_constfn, void *);
extern bool emit_storent_insn (rtx to, rtx from);
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2023-01-27 5:58 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-20 23:01 [gcc(refs/users/aoliva/heads/testme)] add memcpy and memmove loop expanders Alexandre Oliva
2023-01-21 1:06 Alexandre Oliva
2023-01-26 6:09 Alexandre Oliva
2023-01-27 1:57 Alexandre Oliva
2023-01-27 2:25 Alexandre Oliva
2023-01-27 5:58 Alexandre Oliva
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).