public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 6:34 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 6:34 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:856cf5e0722d63ddad1754943bfc1e0363184736
commit 856cf5e0722d63ddad1754943bfc1e0363184736
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 147 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..ae8d29d14bd 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,146 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+/* Expand a comparison of the memory blocks X and Y, LEN units long, as
+   an inline loop, and return the register holding the result.
+
+   Each iteration compares INCR units.  INCR starts as ALIGN converted
+   to units, is capped at 1 << CTZ_LEN (CTZ_LEN is first raised to the
+   trailing-zero count of LEN when LEN is a CONST_INT), and is then
+   halved until can_do_by_pieces accepts it or it reaches 1.
+
+   The result is zero if no block differs.  If EQUALITY_ONLY, it is 1
+   when some block differs.  Otherwise the first differing block is
+   compared again in smaller pieces, down to single units, and the
+   result is the difference of, or -1/1 for the unsigned ordering of,
+   the first differing units.  The result lives in TARGET when that is
+   a pseudo register, or else in a new pseudo in the mode of
+   integer_type_node.
+
+   SIZE_TYPE is only handed on to the recursive invocations.  */
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+                         bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  /* Never step by more than LEN is known to be a multiple of.  A count
+     that reaches the width of HOST_WIDE_INT cannot be shifted, and
+     imposes no cap anyway.  */
+  if (ctz_len < HOST_BITS_PER_WIDE_INT
+      && HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  /* Shrink the step until the by-pieces infrastructure can compare a
+     block of that size; a single unit is the last resort.  */
+  while (incr > 1
+         && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  /* When LEN carries no mode (VOIDmode), count in word_mode.  */
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  /* Compare each block as a scalar integer when a mode of exactly INCR
+     units exists and supports a compare-and-jump; otherwise fall back
+     to BLKmode and compare_by_pieces.  int_mode_for_size is used
+     because it yields an empty mode, rather than aborting, when no
+     integer mode has that size.  */
+  opt_scalar_int_mode int_cmp_mode
+    = int_mode_for_size (incr * BITS_PER_UNIT, 0);
+  if (!int_cmp_mode.exists (&cmp_mode)
+      || !can_compare_p (NE, cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      /* The single-unit tail below needs a scalar mode, and the loop
+         above only guarantees by-pieces support for larger steps.  */
+      gcc_checking_assert (incr != 1);
+    }
+
+  /* Turn the base addresses into operands before the loop.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  /* Enter at the loop test, so that a zero LEN compares nothing.  */
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Address the current block: base plus ITER, in each address mode.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+                             true, OPTAB_LIB_WIDEN);
+
+  /* Leave the loop as soon as a block differs; predict this exit as
+     unlikely, consistently with the likely back edge below.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+                           GET_MODE (part_res), true, ne_label,
+                           profile_probability::guessed_always ()
+                           .apply_scale (1, 10));
+
+  /* Advance ITER to the next block.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+                             true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  /* Loop back while ITER has not reached LEN.  */
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+                           true, top_label,
+                           profile_probability::guessed_always ()
+                           .apply_scale (9, 10));
+
+  /* Falling out of the loop means no block differed, so the result is
+     zero.  Only a pseudo register is used to hold the result.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  /* emit_jump already emits the barrier that follows the jump.  */
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* A difference was found in the block at X and Y.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+        /* ??? Re-compare the block found to be different one word at a
+           time.  The alignment passed must be smaller than the block,
+           or the recursion would pick the same block size again.  */
+        part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+                                            target, equality_only,
+                                            BITS_PER_WORD, 0);
+      else if (incr > 1)
+        /* ??? Re-compare the block found to be different one byte at a
+           time.  We could do better using part_res, and being careful
+           about endianness.  */
+        part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+                                            target, equality_only,
+                                            BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+                         GET_MODE_BITSIZE (cmp_mode)))
+        /* Subtract in the wider mode of TARGET, so that the sign of the
+           result reflects the unsigned ordering of the two units.  A
+           difference computed in CMP_MODE and then sign-extended would
+           have the wrong sign for units that differ by half the range
+           or more.  */
+        part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+                                 true, OPTAB_LIB_WIDEN);
+      else
+        {
+          /* In the odd chance target is QImode, we can't count on
+             widening subtract to capture the result of the unsigned
+             compares.  */
+          rtx_code_label *ltu_label;
+          ltu_label = gen_label_rtx ();
+          emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+                                   cmp_mode, true, ltu_label,
+                                   profile_probability::guessed_always ()
+                                   .apply_scale (5, 10));
+
+          emit_move_insn (target, const1_rtx);
+          emit_jump (res_label);
+
+          emit_label (ltu_label);
+          emit_move_insn (target, constm1_rtx);
+          part_res = target;
+        }
+
+      if (target != part_res)
+        convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-27 5:58 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-27 5:58 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:1f7a6a0f935c73416ef530e050a316dc0fc4d5ad
commit 1f7a6a0f935c73416ef530e050a316dc0fc4d5ad
Author: Alexandre Oliva <oliva@gnu.org>
Date: Thu Jan 26 22:52:20 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 165 ++++++++++++++++++++-
gcc/expr.h | 3 +-
gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c | 6 +
.../gcc.dg/torture/inline-mem-cpy-cmp-1.c | 11 ++
5 files changed, 184 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index a9f92e9eb29..dac3b83f627 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..beb2f6f7bf7 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+/* Expand a comparison of the memory blocks X and Y, LEN units long, as
+   an inline loop, and return the register holding the result.
+
+   Each iteration compares INCR units.  INCR starts as ALIGN converted
+   to units, is capped at 1 << CTZ_LEN (CTZ_LEN is first raised to the
+   trailing-zero count of LEN when LEN is a CONST_INT), and is then
+   halved until can_do_by_pieces accepts it or it reaches 1.
+
+   The result is zero if no block differs.  If EQUALITY_ONLY, it is 1
+   when some block differs.  Otherwise the first differing block is
+   compared again in smaller pieces, down to single units, and the
+   result is the difference of, or -1/1 for the unsigned ordering of,
+   the first differing units.  The result lives in TARGET when that is
+   a pseudo register, or else in a new pseudo in the mode of
+   integer_type_node.
+
+   SIZE_TYPE is only handed on to the recursive invocations.  */
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+                         bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  /* Never step by more than LEN is known to be a multiple of.  A count
+     that reaches the width of HOST_WIDE_INT cannot be shifted, and
+     imposes no cap anyway.  */
+  if (ctz_len < HOST_BITS_PER_WIDE_INT
+      && HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  /* Shrink the step until the by-pieces infrastructure can compare a
+     block of that size; a single unit is the last resort.  */
+  while (incr > 1
+         && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  /* When LEN carries no mode (VOIDmode), count in word_mode.  */
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  /* Compare each block as a scalar integer when a mode of exactly INCR
+     units exists and supports a compare-and-jump; otherwise fall back
+     to BLKmode and compare_by_pieces.  int_mode_for_size is used
+     because it yields an empty mode, rather than aborting, when no
+     integer mode has that size.  */
+  opt_scalar_int_mode int_cmp_mode
+    = int_mode_for_size (incr * BITS_PER_UNIT, 0);
+  if (!int_cmp_mode.exists (&cmp_mode)
+      || !can_compare_p (NE, cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      /* The single-unit tail below needs a scalar mode, and the loop
+         above only guarantees by-pieces support for larger steps.  */
+      gcc_checking_assert (incr != 1);
+    }
+
+  /* Turn the base addresses into operands before the loop.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  /* Enter at the loop test, so that a zero LEN compares nothing.  */
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Address the current block: base plus ITER, in each address mode.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+                             true, OPTAB_LIB_WIDEN);
+
+  /* Leave the loop as soon as a block differs; predict this exit as
+     unlikely, consistently with the likely back edge below.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+                           GET_MODE (part_res), true, ne_label,
+                           profile_probability::guessed_always ()
+                           .apply_scale (1, 10));
+
+  /* Advance ITER to the next block.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+                             true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  /* Loop back while ITER has not reached LEN.  */
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+                           true, top_label,
+                           profile_probability::guessed_always ()
+                           .apply_scale (9, 10));
+
+  /* Falling out of the loop means no block differed, so the result is
+     zero.  Only a pseudo register is used to hold the result.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  /* emit_jump already emits the barrier that follows the jump.  */
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* A difference was found in the block at X and Y.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+        /* ??? Re-compare the block found to be different one word at a
+           time.  The alignment passed must be smaller than the block,
+           or the recursion would pick the same block size again.  */
+        part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+                                            target, equality_only,
+                                            BITS_PER_WORD, 0);
+      else if (incr > 1)
+        /* ??? Re-compare the block found to be different one byte at a
+           time.  We could do better using part_res, and being careful
+           about endianness.  */
+        part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+                                            target, equality_only,
+                                            BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+                         GET_MODE_BITSIZE (cmp_mode)))
+        /* Subtract in the wider mode of TARGET, so that the sign of the
+           result reflects the unsigned ordering of the two units.  A
+           difference computed in CMP_MODE and then sign-extended would
+           have the wrong sign for units that differ by half the range
+           or more.  */
+        part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+                                 true, OPTAB_LIB_WIDEN);
+      else
+        {
+          /* In the odd chance target is QImode, we can't count on
+             widening subtract to capture the result of the unsigned
+             compares.  */
+          rtx_code_label *ltu_label;
+          ltu_label = gen_label_rtx ();
+          emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+                                   cmp_mode, true, ltu_label,
+                                   profile_probability::guessed_always ()
+                                   .apply_scale (5, 10));
+
+          emit_move_insn (target, const1_rtx);
+          emit_jump (res_label);
+
+          emit_label (ltu_label);
+          emit_move_insn (target, constm1_rtx);
+          part_res = target;
+        }
+
+      if (target != part_res)
+        convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..bcd774c0f77
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcmp -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not "memcmp" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..6b4fef0b20f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+ memcpy tests. */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memcmp" } } */
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-27 2:25 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-27 2:25 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:be2ffb9c759274c1e457c4c33b6695b5ca4b71a1
commit be2ffb9c759274c1e457c4c33b6695b5ca4b71a1
Author: Alexandre Oliva <oliva@gnu.org>
Date: Thu Jan 26 22:52:20 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 165 ++++++++++++++++++++-
gcc/expr.h | 3 +-
gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c | 6 +
.../gcc.dg/torture/inline-mem-cpy-cmp-1.c | 11 ++
5 files changed, 184 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 4bf404a8a78..2c215bc29df 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..beb2f6f7bf7 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+/* Expand a comparison of the memory blocks X and Y, LEN units long, as
+   an inline loop, and return the register holding the result.
+
+   Each iteration compares INCR units.  INCR starts as ALIGN converted
+   to units, is capped at 1 << CTZ_LEN (CTZ_LEN is first raised to the
+   trailing-zero count of LEN when LEN is a CONST_INT), and is then
+   halved until can_do_by_pieces accepts it or it reaches 1.
+
+   The result is zero if no block differs.  If EQUALITY_ONLY, it is 1
+   when some block differs.  Otherwise the first differing block is
+   compared again in smaller pieces, down to single units, and the
+   result is the difference of, or -1/1 for the unsigned ordering of,
+   the first differing units.  The result lives in TARGET when that is
+   a pseudo register, or else in a new pseudo in the mode of
+   integer_type_node.
+
+   SIZE_TYPE is only handed on to the recursive invocations.  */
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+                         bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  /* Never step by more than LEN is known to be a multiple of.  A count
+     that reaches the width of HOST_WIDE_INT cannot be shifted, and
+     imposes no cap anyway.  */
+  if (ctz_len < HOST_BITS_PER_WIDE_INT
+      && HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  /* Shrink the step until the by-pieces infrastructure can compare a
+     block of that size; a single unit is the last resort.  */
+  while (incr > 1
+         && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  /* When LEN carries no mode (VOIDmode), count in word_mode.  */
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  /* Compare each block as a scalar integer when a mode of exactly INCR
+     units exists and supports a compare-and-jump; otherwise fall back
+     to BLKmode and compare_by_pieces.  int_mode_for_size is used
+     because it yields an empty mode, rather than aborting, when no
+     integer mode has that size.  */
+  opt_scalar_int_mode int_cmp_mode
+    = int_mode_for_size (incr * BITS_PER_UNIT, 0);
+  if (!int_cmp_mode.exists (&cmp_mode)
+      || !can_compare_p (NE, cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      /* The single-unit tail below needs a scalar mode, and the loop
+         above only guarantees by-pieces support for larger steps.  */
+      gcc_checking_assert (incr != 1);
+    }
+
+  /* Turn the base addresses into operands before the loop.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  /* Enter at the loop test, so that a zero LEN compares nothing.  */
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Address the current block: base plus ITER, in each address mode.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+                             true, OPTAB_LIB_WIDEN);
+
+  /* Leave the loop as soon as a block differs; predict this exit as
+     unlikely, consistently with the likely back edge below.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+                           GET_MODE (part_res), true, ne_label,
+                           profile_probability::guessed_always ()
+                           .apply_scale (1, 10));
+
+  /* Advance ITER to the next block.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+                             true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  /* Loop back while ITER has not reached LEN.  */
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+                           true, top_label,
+                           profile_probability::guessed_always ()
+                           .apply_scale (9, 10));
+
+  /* Falling out of the loop means no block differed, so the result is
+     zero.  Only a pseudo register is used to hold the result.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  /* emit_jump already emits the barrier that follows the jump.  */
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* A difference was found in the block at X and Y.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+        /* ??? Re-compare the block found to be different one word at a
+           time.  The alignment passed must be smaller than the block,
+           or the recursion would pick the same block size again.  */
+        part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+                                            target, equality_only,
+                                            BITS_PER_WORD, 0);
+      else if (incr > 1)
+        /* ??? Re-compare the block found to be different one byte at a
+           time.  We could do better using part_res, and being careful
+           about endianness.  */
+        part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+                                            target, equality_only,
+                                            BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+                         GET_MODE_BITSIZE (cmp_mode)))
+        /* Subtract in the wider mode of TARGET, so that the sign of the
+           result reflects the unsigned ordering of the two units.  A
+           difference computed in CMP_MODE and then sign-extended would
+           have the wrong sign for units that differ by half the range
+           or more.  */
+        part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+                                 true, OPTAB_LIB_WIDEN);
+      else
+        {
+          /* In the odd chance target is QImode, we can't count on
+             widening subtract to capture the result of the unsigned
+             compares.  */
+          rtx_code_label *ltu_label;
+          ltu_label = gen_label_rtx ();
+          emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+                                   cmp_mode, true, ltu_label,
+                                   profile_probability::guessed_always ()
+                                   .apply_scale (5, 10));
+
+          emit_move_insn (target, const1_rtx);
+          emit_jump (res_label);
+
+          emit_label (ltu_label);
+          emit_move_insn (target, constm1_rtx);
+          part_res = target;
+        }
+
+      if (target != part_res)
+        convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..bcd774c0f77
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcmp -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not "memcmp" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..6b4fef0b20f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+ memcpy tests. */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memcmp" } } */
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-27 1:57 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-27 1:57 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:acb03e3137a3c75089625af0ea88dc19715607b5
commit acb03e3137a3c75089625af0ea88dc19715607b5
Author: Alexandre Oliva <oliva@gnu.org>
Date: Thu Jan 26 22:52:20 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 165 ++++++++++++++++++++-
gcc/expr.h | 3 +-
gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c | 6 +
.../gcc.dg/torture/inline-mem-cpy-cmp-1.c | 11 ++
5 files changed, 184 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..beb2f6f7bf7 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+/* Expand a comparison of the memory blocks X and Y, LEN units long, as
+   an inline loop, and return the register holding the result.
+
+   Each iteration compares INCR units.  INCR starts as ALIGN converted
+   to units, is capped at 1 << CTZ_LEN (CTZ_LEN is first raised to the
+   trailing-zero count of LEN when LEN is a CONST_INT), and is then
+   halved until can_do_by_pieces accepts it or it reaches 1.
+
+   The result is zero if no block differs.  If EQUALITY_ONLY, it is 1
+   when some block differs.  Otherwise the first differing block is
+   compared again in smaller pieces, down to single units, and the
+   result is the difference of, or -1/1 for the unsigned ordering of,
+   the first differing units.  The result lives in TARGET when that is
+   a pseudo register, or else in a new pseudo in the mode of
+   integer_type_node.
+
+   SIZE_TYPE is only handed on to the recursive invocations.  */
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+                         bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  /* Never step by more than LEN is known to be a multiple of.  A count
+     that reaches the width of HOST_WIDE_INT cannot be shifted, and
+     imposes no cap anyway.  */
+  if (ctz_len < HOST_BITS_PER_WIDE_INT
+      && HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  /* Shrink the step until the by-pieces infrastructure can compare a
+     block of that size; a single unit is the last resort.  */
+  while (incr > 1
+         && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  /* When LEN carries no mode (VOIDmode), count in word_mode.  */
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  /* Compare each block as a scalar integer when a mode of exactly INCR
+     units exists and supports a compare-and-jump; otherwise fall back
+     to BLKmode and compare_by_pieces.  int_mode_for_size is used
+     because it yields an empty mode, rather than aborting, when no
+     integer mode has that size.  */
+  opt_scalar_int_mode int_cmp_mode
+    = int_mode_for_size (incr * BITS_PER_UNIT, 0);
+  if (!int_cmp_mode.exists (&cmp_mode)
+      || !can_compare_p (NE, cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      /* The single-unit tail below needs a scalar mode, and the loop
+         above only guarantees by-pieces support for larger steps.  */
+      gcc_checking_assert (incr != 1);
+    }
+
+  /* Turn the base addresses into operands before the loop.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  /* Enter at the loop test, so that a zero LEN compares nothing.  */
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  /* Address the current block: base plus ITER, in each address mode.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+                             true, OPTAB_LIB_WIDEN);
+
+  /* Leave the loop as soon as a block differs; predict this exit as
+     unlikely, consistently with the likely back edge below.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+                           GET_MODE (part_res), true, ne_label,
+                           profile_probability::guessed_always ()
+                           .apply_scale (1, 10));
+
+  /* Advance ITER to the next block.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+                             true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  /* Loop back while ITER has not reached LEN.  */
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+                           true, top_label,
+                           profile_probability::guessed_always ()
+                           .apply_scale (9, 10));
+
+  /* Falling out of the loop means no block differed, so the result is
+     zero.  Only a pseudo register is used to hold the result.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  /* emit_jump already emits the barrier that follows the jump.  */
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* A difference was found in the block at X and Y.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+        /* ??? Re-compare the block found to be different one word at a
+           time.  The alignment passed must be smaller than the block,
+           or the recursion would pick the same block size again.  */
+        part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+                                            target, equality_only,
+                                            BITS_PER_WORD, 0);
+      else if (incr > 1)
+        /* ??? Re-compare the block found to be different one byte at a
+           time.  We could do better using part_res, and being careful
+           about endianness.  */
+        part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+                                            target, equality_only,
+                                            BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+                         GET_MODE_BITSIZE (cmp_mode)))
+        /* Subtract in the wider mode of TARGET, so that the sign of the
+           result reflects the unsigned ordering of the two units.  A
+           difference computed in CMP_MODE and then sign-extended would
+           have the wrong sign for units that differ by half the range
+           or more.  */
+        part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+                                 true, OPTAB_LIB_WIDEN);
+      else
+        {
+          /* In the odd chance target is QImode, we can't count on
+             widening subtract to capture the result of the unsigned
+             compares.  */
+          rtx_code_label *ltu_label;
+          ltu_label = gen_label_rtx ();
+          emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+                                   cmp_mode, true, ltu_label,
+                                   profile_probability::guessed_always ()
+                                   .apply_scale (5, 10));
+
+          emit_move_insn (target, const1_rtx);
+          emit_jump (res_label);
+
+          emit_label (ltu_label);
+          emit_move_insn (target, constm1_rtx);
+          part_res = target;
+        }
+
+      if (target != part_res)
+        convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..bcd774c0f77
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcmp -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not "memcmp" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..6b4fef0b20f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+ memcpy tests. */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memcmp" } } */
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 8:44 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 8:44 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:bfd738f7bf883f39cef1d0e8bf08594749e542ce
commit bfd738f7bf883f39cef1d0e8bf08594749e542ce
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 167 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..beb2f6f7bf7 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+/* Expand a comparison of the LEN-byte blocks at MEMs X and Y as an
+ inline loop, and return the register holding the result, which is
+ zero if every chunk compared equal. ALIGN is in bits: it is divided
+ by BITS_PER_UNIT to get the largest chunk, in bytes, compared per
+ iteration. CTZ_LEN is a known trailing-zero count of LEN; the chunk
+ is capped at 1<<CTZ_LEN bytes. If EQUALITY_ONLY, the result is just
+ 1 on a mismatch; otherwise the mismatching chunk is examined further
+ to compute the result. TARGET is used for the result only if it is
+ a pseudo register; otherwise a new pseudo in integer_type_node's
+ mode is created. SIZE_TYPE is only passed on to recursive calls. */
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_len)
+{
+ /* Start with a chunk as large as the alignment, in bytes. */
+ unsigned incr = align / BITS_PER_UNIT;
+
+ /* A constant LEN may show more trailing zeros than the caller passed. */
+ if (CONST_INT_P (len))
+ ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+ /* Don't use chunks larger than 1<<CTZ_LEN bytes. */
+ if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_len;
+
+ /* Halve the chunk until can_do_by_pieces accepts it, or it is one
+ byte. */
+ while (incr > 1
+ && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ /* Iterate in LEN's mode; if that is VOIDmode, fall back to word_mode. */
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode)
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx ();
+ cmp_label = gen_label_rtx ();
+ ne_label = gen_label_rtx ();
+ res_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
+ /* ITER is a byte offset running from 0 up to LEN in steps of INCR. */
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ /* Compare in an integer mode exactly as wide as the chunk if NE jumps
+ are available in it; otherwise compare each chunk as BLKmode. */
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+ || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+ {
+ cmp_mode = BLKmode;
+ gcc_checking_assert (incr != 1);
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ /* Enter the loop at its exit test, so that ITER is checked against
+ LEN before the first chunk is compared. */
+ emit_jump (cmp_label);
+ emit_label (top_label);
+
+ /* Address the current chunk of each block: base address plus ITER,
+ converted to the respective address mode. */
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+ if (x_addr_mode != y_addr_mode)
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ /* From here on X and Y refer to the current chunk, in CMP_MODE. */
+ x = change_address (x, cmp_mode, x_addr);
+ y = change_address (y, cmp_mode, y_addr);
+
+ /* Compute a per-chunk result that is tested against zero below. */
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+ true, OPTAB_LIB_WIDEN);
+
+ /* Leave the loop for NE_LABEL if the chunk result is nonzero; this
+ branch is guessed to be taken 1 time in 10. */
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (1, 10));
+
+ /* Advance ITER by one chunk. */
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ /* Loop exit test: go back to TOP_LABEL while ITER is below LEN. */
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ /* Only store the result in TARGET if it is a pseudo register. */
+ if (target == NULL_RTX
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ /* The loop ran to completion without a mismatch: the result is zero. */
+ emit_move_insn (target, const0_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ /* A chunk differed; X and Y still address that chunk. */
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx);
+ else
+ {
+ if (incr > UNITS_PER_WORD)
+ /* ??? Re-compare the block found to be different one word at a
+ time. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_WORD, 0);
+ else if (incr > 1)
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ /* Single-byte chunks, and TARGET's mode is wider than CMP_MODE:
+ subtract Y from X in TARGET's mode. */
+ else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+ GET_MODE_BITSIZE (cmp_mode)))
+ part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+ true, OPTAB_LIB_WIDEN);
+ else
+ {
+ /* In the odd chance target is QImode, we can't count on
+ widening subtract to capture the result of the unsigned
+ compares. */
+ rtx_code_label *ltu_label;
+ ltu_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+ cmp_mode, true, ltu_label,
+ profile_probability::guessed_always ()
+ .apply_scale (5, 10));
+
+ /* X is not below Y, and this point is only reached after a
+ mismatch: the result is 1. */
+ emit_move_insn (target, const1_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ /* X is below Y as unsigned: the result is -1. */
+ emit_label (ltu_label);
+ emit_move_insn (target, constm1_rtx);
+ part_res = target;
+ }
+
+ /* Copy the result into TARGET if it was computed elsewhere. */
+ if (target != part_res)
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 8:21 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 8:21 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:c234c022348c1ada7594294ade986951521a6a5b
commit c234c022348c1ada7594294ade986951521a6a5b
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 167 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..7cb693525ba 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+/* Expand a comparison of the LEN-byte blocks at MEMs X and Y as an
+ inline loop, and return the register holding the result, which is
+ zero if every chunk compared equal. ALIGN is in bits: it is divided
+ by BITS_PER_UNIT to get the largest chunk, in bytes, compared per
+ iteration. CTZ_LEN is a known trailing-zero count of LEN; the chunk
+ is capped at 1<<CTZ_LEN bytes. If EQUALITY_ONLY, the result is just
+ 1 on a mismatch; otherwise the mismatching chunk is examined further
+ to compute the result. TARGET is used for the result only if it is
+ a pseudo register; otherwise a new pseudo in integer_type_node's
+ mode is created. SIZE_TYPE is only passed on to recursive calls. */
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_len)
+{
+ /* Start with a chunk as large as the alignment, in bytes. */
+ unsigned incr = align / BITS_PER_UNIT;
+
+ /* A constant LEN may show more trailing zeros than the caller passed. */
+ if (CONST_INT_P (len))
+ ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+ /* Don't use chunks larger than 1<<CTZ_LEN bytes. */
+ if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_len;
+
+ /* Halve the chunk until can_do_by_pieces accepts it, or it is one
+ byte. */
+ while (incr > 1
+ && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ /* Iterate in LEN's mode; if that is VOIDmode, fall back to word_mode. */
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode)
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx ();
+ cmp_label = gen_label_rtx ();
+ ne_label = gen_label_rtx ();
+ res_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
+ /* ITER is a byte offset running from 0 up to LEN in steps of INCR. */
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ /* Compare in an integer mode exactly as wide as the chunk if NE jumps
+ are available in it; otherwise compare each chunk as BLKmode. */
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+ || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+ {
+ cmp_mode = BLKmode;
+ gcc_checking_assert (incr != 1);
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ /* Enter the loop at its exit test, so that ITER is checked against
+ LEN before the first chunk is compared. */
+ emit_jump (cmp_label);
+ emit_label (top_label);
+
+ /* Address the current chunk of each block: base address plus ITER,
+ converted to the respective address mode. */
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+ if (x_addr_mode != y_addr_mode)
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ /* From here on X and Y refer to the current chunk, in CMP_MODE. */
+ x = change_address (x, cmp_mode, x_addr);
+ y = change_address (y, cmp_mode, y_addr);
+
+ /* Compute a per-chunk result that is tested against zero below. */
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+ true, OPTAB_LIB_WIDEN);
+
+ /* Leave the loop for NE_LABEL if the chunk result is nonzero; this
+ branch is guessed to be taken 1 time in 10. */
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (1, 10));
+
+ /* Advance ITER by one chunk. */
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ /* Loop exit test: go back to TOP_LABEL while ITER is below LEN. */
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ /* Only store the result in TARGET if it is a pseudo register. */
+ if (target == NULL_RTX
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ /* The loop ran to completion without a mismatch: the result is zero. */
+ emit_move_insn (target, const0_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ /* A chunk differed; X and Y still address that chunk. */
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx);
+ else
+ {
+ if (incr > UNITS_PER_WORD)
+ /* ??? Re-compare the block found to be different one word at a
+ time. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_WORD, 0);
+ else if (incr > 1)
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ /* Single-byte chunks, and TARGET's mode is wider than CMP_MODE:
+ subtract Y from X in TARGET's mode. */
+ else if (GET_MODE_BITSIZE (GET_MODE (target))
+ > GET_MODE_BITSIZE (cmp_mode))
+ part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+ true, OPTAB_LIB_WIDEN);
+ else
+ {
+ /* In the odd chance target is QImode, we can't count on
+ widening subtract to capture the result of the unsigned
+ compares. */
+ rtx_code_label *ltu_label;
+ ltu_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+ cmp_mode, true, ltu_label,
+ profile_probability::guessed_always ()
+ .apply_scale (5, 10));
+
+ /* X is not below Y, and this point is only reached after a
+ mismatch: the result is 1. */
+ emit_move_insn (target, const1_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ /* X is below Y as unsigned: the result is -1. */
+ emit_label (ltu_label);
+ emit_move_insn (target, constm1_rtx);
+ part_res = target;
+ }
+
+ /* Copy the result into TARGET if it was computed elsewhere. */
+ if (target != part_res)
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 8:01 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 8:01 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:14be539acc63313992496f72f4ebb23c44804ed8
commit 14be539acc63313992496f72f4ebb23c44804ed8
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 146 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..8e40ae8a15f 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,145 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+/* Expand a comparison of the LEN-byte blocks at MEMs X and Y as an
+ inline loop, and return the register holding the result, which is
+ zero if every chunk compared equal. ALIGN is in bits: it is divided
+ by BITS_PER_UNIT to get the largest chunk, in bytes, compared per
+ iteration. CTZ_LEN is a known trailing-zero count of LEN; the chunk
+ is capped at 1<<CTZ_LEN bytes. If EQUALITY_ONLY, the result is just
+ 1 on a mismatch; otherwise the mismatching chunk is examined further
+ to compute the result. TARGET is used for the result only if it is
+ a pseudo register; otherwise a new pseudo in integer_type_node's
+ mode is created. SIZE_TYPE is only passed on to recursive calls. */
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_len)
+{
+ /* Start with a chunk as large as the alignment, in bytes. */
+ unsigned incr = align / BITS_PER_UNIT;
+
+ /* A constant LEN may show more trailing zeros than the caller passed. */
+ if (CONST_INT_P (len))
+ ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+ /* Don't use chunks larger than 1<<CTZ_LEN bytes. */
+ if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_len;
+
+ /* Halve the chunk until can_do_by_pieces accepts it, or it is one
+ byte. */
+ while (incr > 1
+ && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ /* Iterate in LEN's mode; if that is VOIDmode, fall back to word_mode. */
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode)
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx ();
+ cmp_label = gen_label_rtx ();
+ ne_label = gen_label_rtx ();
+ res_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
+ /* ITER is a byte offset running from 0 up to LEN in steps of INCR. */
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ /* Compare in an integer mode exactly as wide as the chunk if NE jumps
+ are available in it; otherwise compare each chunk as BLKmode. */
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+ || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+ {
+ cmp_mode = BLKmode;
+ gcc_checking_assert (incr != 1);
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ /* Enter the loop at its exit test, so that ITER is checked against
+ LEN before the first chunk is compared. */
+ emit_jump (cmp_label);
+ emit_label (top_label);
+
+ /* Address the current chunk of each block: base address plus ITER,
+ converted to the respective address mode. */
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+ if (x_addr_mode != y_addr_mode)
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ /* From here on X and Y refer to the current chunk, in CMP_MODE. */
+ x = change_address (x, cmp_mode, x_addr);
+ y = change_address (y, cmp_mode, y_addr);
+
+ /* Compute a per-chunk result that is tested against zero below. */
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+ true, OPTAB_LIB_WIDEN);
+
+ /* Leave the loop for NE_LABEL if the chunk result is nonzero.
+ NOTE(review): this mismatch exit is guessed at 9 in 10, the same
+ probability given to the loop back-edge below; presumably it was
+ meant to be the unlikely branch -- confirm. */
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ /* Advance ITER by one chunk. */
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ /* Loop exit test: go back to TOP_LABEL while ITER is below LEN. */
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ /* Only store the result in TARGET if it is a pseudo register. */
+ if (target == NULL_RTX
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ /* The loop ran to completion without a mismatch: the result is zero. */
+ emit_move_insn (target, const0_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ /* A chunk differed; X and Y still address that chunk. */
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx);
+ else
+ {
+ if (incr > UNITS_PER_WORD)
+ /* ??? Re-compare the block found to be different one word at a
+ time. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_WORD, 0);
+ else if (incr > 1)
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ else
+ /* Single-byte chunks: subtract Y from X in TARGET's mode. */
+ part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+ true, OPTAB_LIB_WIDEN);
+
+ /* Copy the result into TARGET if it was computed elsewhere. */
+ if (target != part_res)
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 7:02 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 7:02 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:4707a9232f8aa7f60e8745778a1f7f75bdc8540a
commit 4707a9232f8aa7f60e8745778a1f7f75bdc8540a
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 143 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..3e40e1e588f 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,142 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+/* Expand a comparison of the LEN-byte blocks at MEMs X and Y as an
+ inline loop, and return the register holding the result, which is
+ zero if every chunk compared equal. ALIGN is in bits: it is divided
+ by BITS_PER_UNIT to get the largest chunk, in bytes, compared per
+ iteration. CTZ_LEN is a known trailing-zero count of LEN; the chunk
+ is capped at 1<<CTZ_LEN bytes. If EQUALITY_ONLY, the result is just
+ 1 on a mismatch; otherwise the mismatching chunk is examined further
+ to compute the result. TARGET is used for the result only if it is
+ a pseudo register; otherwise a new pseudo in integer_type_node's
+ mode is created. SIZE_TYPE is only passed on to recursive calls. */
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_len)
+{
+ /* Start with a chunk as large as the alignment, in bytes. */
+ unsigned incr = align / BITS_PER_UNIT;
+
+ /* A constant LEN may show more trailing zeros than the caller passed. */
+ if (CONST_INT_P (len))
+ ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+ /* Don't use chunks larger than 1<<CTZ_LEN bytes. */
+ if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_len;
+
+ /* Halve the chunk until can_do_by_pieces accepts it, or it is one
+ byte. */
+ while (incr > 1
+ && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ /* Iterate in LEN's mode; if that is VOIDmode, fall back to word_mode. */
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode)
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx ();
+ cmp_label = gen_label_rtx ();
+ ne_label = gen_label_rtx ();
+ res_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
+ /* ITER is a byte offset running from 0 up to LEN in steps of INCR. */
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ /* Compare in an integer mode exactly as wide as the chunk if NE jumps
+ are available in it; otherwise compare each chunk as BLKmode. */
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+ || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+ {
+ cmp_mode = BLKmode;
+ gcc_checking_assert (incr != 1);
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ /* Enter the loop at its exit test, so that ITER is checked against
+ LEN before the first chunk is compared. */
+ emit_jump (cmp_label);
+ emit_label (top_label);
+
+ /* Address the current chunk of each block: base address plus ITER,
+ converted to the respective address mode. */
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+ if (x_addr_mode != y_addr_mode)
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ /* From here on X and Y refer to the current chunk, in CMP_MODE. */
+ x = change_address (x, cmp_mode, x_addr);
+ y = change_address (y, cmp_mode, y_addr);
+
+ /* Compute a per-chunk result that is tested against zero below. */
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+ true, OPTAB_LIB_WIDEN);
+
+ /* Leave the loop for NE_LABEL if the chunk result is nonzero.
+ NOTE(review): this mismatch exit is guessed at 9 in 10, the same
+ probability given to the loop back-edge below; presumably it was
+ meant to be the unlikely branch -- confirm. */
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ /* Advance ITER by one chunk. */
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ /* Loop exit test: go back to TOP_LABEL while ITER is below LEN. */
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ /* Only store the result in TARGET if it is a pseudo register. */
+ if (target == NULL_RTX
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ /* The loop ran to completion without a mismatch: the result is zero. */
+ emit_move_insn (target, const0_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ /* A chunk differed; X and Y still address that chunk. */
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx);
+ else
+ {
+ if (incr > UNITS_PER_WORD)
+ /* ??? Re-compare the block found to be different one word at a
+ time. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_WORD, 0);
+ else if (incr > 1)
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+
+ /* NOTE(review): when INCR is 1 neither branch above runs, so
+ PART_RES is still the CMP_MODE difference computed inside the
+ loop, and it is converted to TARGET's mode with a false last
+ argument. Presumably that is a signed conversion, which would
+ give the wrong sign for bytes that differ by more than the
+ signed range of CMP_MODE -- confirm. */
+ if (target != part_res)
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 6:40 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 6:40 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:5198c39dd33b64da34d1c76cee6592d9814f0861
commit 5198c39dd33b64da34d1c76cee6592d9814f0861
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 147 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..2913452797c 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,146 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_len)
+{ /* Emit an inline loop comparing LEN bytes of X and Y, INCR bytes per iteration; returns the result register. */
+ unsigned incr = align / BITS_PER_UNIT; /* Chunk size in bytes, starting from the alignment. */
+
+ if (CONST_INT_P (len)) /* A constant LEN may show more trailing zeros than the caller's hint. */
+ ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+ if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr) /* NOTE(review): assumes ctz_len is below the HOST_WIDE_INT width -- confirm for LEN == 0. */
+ incr = HOST_WIDE_INT_1U << ctz_len; /* Cap the chunk at the power of two known to divide LEN. */
+
+ while (incr > 1 /* Halve the chunk until by-pieces comparison accepts it. */
+ && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode) /* LEN carries no mode (e.g. a constant): count in word_mode. */
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx (); /* Start of the loop body. */
+ cmp_label = gen_label_rtx (); /* Loop test (ITER against LEN). */
+ ne_label = gen_label_rtx (); /* Reached when a chunk differs. */
+ res_label = gen_label_rtx (); /* Common exit, TARGET holds the result. */
+ iter = gen_reg_rtx (iter_mode); /* Byte offset of the current chunk. */
+
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+ || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+ { /* No exact-width integer mode with a usable NE branch: compare each chunk by pieces. */
+ cmp_mode = BLKmode;
+ gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ emit_jump (cmp_label); /* Enter at the loop test, so the body is skipped when ITER is not below LEN. */
+ emit_label (top_label);
+
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp); /* Base address plus ITER. */
+
+ if (x_addr_mode != y_addr_mode) /* Otherwise TMP converted for X is reused for Y. */
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ x = change_address (x, cmp_mode, x_addr); /* X and Y now refer to the current chunk. */
+ y = change_address (y, cmp_mode, y_addr);
+
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX, /* X - Y in cmp_mode; nonzero means the chunks differ. */
+ true, OPTAB_LIB_WIDEN);
+
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX, /* Leave the loop at the first differing chunk. */
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10)); /* NOTE(review): 9/10 marks the mismatch exit as the likely path -- confirm intended. */
+
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter, /* ITER += INCR. */
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode, /* Loop back while ITER < LEN. */
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ if (target == NULL_RTX /* The result must live in a pseudo register. */
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ emit_move_insn (target, const0_rtx); /* Fell out of the loop: no chunk differed. */
+ emit_jump (res_label);
+ emit_barrier ();
+
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx); /* Only zero versus nonzero matters here. */
+ else if (incr == 1)
+ convert_move (target, part_res, false); /* NOTE(review): sign-extends the cmp_mode difference -- confirm the sign is right when bytes differ by more than 127. */
+ else if (incr > UNITS_PER_WORD)
+ {
+ /* ??? Re-compare the block found to be different one word at a
+ time. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_WORD, 0);
+ if (target != part_res)
+ convert_move (target, part_res, false);
+ }
+ else
+ {
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ if (target != part_res)
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 6:38 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 6:38 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:abc556fb901c7566e61fb79989b57df19d30fc32
commit abc556fb901c7566e61fb79989b57df19d30fc32
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 147 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..3116ad0ab09 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,146 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_len)
+{ /* Emit an inline loop comparing LEN bytes of X and Y, INCR bytes per iteration; returns the result register. */
+ unsigned incr = align / BITS_PER_UNIT; /* Chunk size in bytes, starting from the alignment. */
+
+ if (CONST_INT_P (len)) /* A constant LEN may show more trailing zeros than the caller's hint. */
+ ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+ if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_len; /* Cap the chunk at the power of two known to divide LEN. */
+
+ while (incr > 1 /* Halve the chunk only while by-pieces comparison REJECTS it. */
+ && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode) /* LEN carries no mode (e.g. a constant): count in word_mode. */
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx (); /* Start of the loop body. */
+ cmp_label = gen_label_rtx (); /* Loop test (ITER against LEN). */
+ ne_label = gen_label_rtx (); /* Reached when a chunk differs. */
+ res_label = gen_label_rtx (); /* Common exit, TARGET holds the result. */
+ iter = gen_reg_rtx (iter_mode); /* Byte offset of the current chunk. */
+
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+ || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+ { /* No exact-width integer mode with a usable NE branch: compare each chunk by pieces. */
+ cmp_mode = BLKmode;
+ gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ emit_jump (cmp_label); /* Enter at the loop test, so the body is skipped when ITER is not below LEN. */
+ emit_label (top_label);
+
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp); /* Base address plus ITER. */
+
+ if (x_addr_mode != y_addr_mode) /* Otherwise TMP converted for X is reused for Y. */
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ x = change_address (x, cmp_mode, x_addr); /* X and Y now refer to the current chunk. */
+ y = change_address (y, cmp_mode, y_addr);
+
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX, /* X - Y in cmp_mode; nonzero means the chunks differ. */
+ true, OPTAB_LIB_WIDEN);
+
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX, /* Leave the loop at the first differing chunk. */
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10)); /* NOTE(review): 9/10 marks the mismatch exit as the likely path -- confirm intended. */
+
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter, /* ITER += INCR. */
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode, /* Loop back while ITER < LEN. */
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ if (target == NULL_RTX /* The result must live in a pseudo register. */
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ emit_move_insn (target, const0_rtx); /* Fell out of the loop: no chunk differed. */
+ emit_jump (res_label);
+ emit_barrier ();
+
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx); /* Only zero versus nonzero matters here. */
+ else if (incr == 1)
+ convert_move (target, part_res, false); /* NOTE(review): sign-extends the cmp_mode difference -- confirm the sign is right when bytes differ by more than 127. */
+ else if (incr > UNITS_PER_WORD)
+ {
+ /* ??? Re-compare the block found to be different one word at a
+ time. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_WORD, 0);
+ if (target != part_res)
+ convert_move (target, part_res, false);
+ }
+ else
+ {
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ if (target != part_res)
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 6:14 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 6:14 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:ddb78c573c96d1368519421fa71b9a3e3f9c5ce5
commit ddb78c573c96d1368519421fa71b9a3e3f9c5ce5
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 140 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..933a28e3891 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,139 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_len)
+{ /* Emit an inline loop comparing LEN bytes of X and Y, INCR bytes per iteration; returns the result register, or NULL_RTX if no usable compare mode is found for a tri-state result. */
+ unsigned incr = align / BITS_PER_UNIT; /* Chunk size in bytes, starting from the alignment. */
+
+ if (CONST_INT_P (len)) /* A constant LEN may show more trailing zeros than the caller's hint. */
+ ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+ if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_len; /* Cap the chunk at the power of two known to divide LEN. */
+
+ while (incr > 1 /* Halve the chunk until it is usable for the requested kind of compare. */
+ && !(equality_only
+ ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+ : int_mode_for_size (incr * BITS_PER_UNIT, 0).exists ())) /* int_mode_for_size takes bits; INCR is in bytes. */
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode) /* LEN carries no mode (e.g. a constant): count in word_mode. */
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx (); /* Start of the loop body. */
+ cmp_label = gen_label_rtx (); /* Loop test (ITER against LEN). */
+ ne_label = gen_label_rtx (); /* Reached when a chunk differs. */
+ res_label = gen_label_rtx (); /* Common exit, TARGET holds the result. */
+ iter = gen_reg_rtx (iter_mode); /* Byte offset of the current chunk. */
+
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+ || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+ { /* No exact-width integer mode with a usable NE branch. */
+ if (!equality_only) /* Give up before the assert: only the equality path checked by-pieces support. */
+ return NULL_RTX;
+ cmp_mode = BLKmode;
+ gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ emit_jump (cmp_label); /* Enter at the loop test, so the body is skipped when ITER is not below LEN. */
+ emit_label (top_label);
+
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp); /* Base address plus ITER. */
+
+ if (x_addr_mode != y_addr_mode) /* Otherwise TMP converted for X is reused for Y. */
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ x = change_address (x, cmp_mode, x_addr); /* X and Y now refer to the current chunk. */
+ y = change_address (y, cmp_mode, y_addr);
+
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX, /* X - Y in cmp_mode; nonzero means the chunks differ. */
+ true, OPTAB_LIB_WIDEN);
+
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX, /* Leave the loop at the first differing chunk. */
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10)); /* NOTE(review): 9/10 marks the mismatch exit as the likely path -- confirm intended. */
+
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter, /* ITER += INCR. */
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode, /* Loop back while ITER < LEN. */
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ if (target == NULL_RTX /* The result must live in a pseudo register. */
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ emit_move_insn (target, const0_rtx); /* Fell out of the loop: no chunk differed. */
+ emit_jump (res_label);
+ emit_barrier ();
+
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx); /* Only zero versus nonzero matters here. */
+ else if (incr == 1)
+ convert_move (target, part_res, false); /* NOTE(review): sign-extends the cmp_mode difference -- confirm the sign is right when bytes differ by more than 127. */
+ else
+ {
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 6:09 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 6:09 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:fd9673deea478839fd5a91ce8131e9d7a963de18
commit fd9673deea478839fd5a91ce8131e9d7a963de18
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 140 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..e8c53b4c7ce 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,139 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_len)
+{ /* Emit an inline loop comparing LEN bytes of X and Y, INCR bytes per iteration; returns the result register, or NULL_RTX if no usable compare mode is found for a tri-state result. */
+ unsigned incr = align / BITS_PER_UNIT; /* Chunk size in bytes, starting from the alignment. */
+
+ if (CONST_INT_P (len)) /* A constant LEN may show more trailing zeros than the caller's hint. */
+ ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+ if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_len; /* Cap the chunk at the power of two known to divide LEN. */
+
+ while (incr > 1 /* Halve the chunk until it is usable for the requested kind of compare. */
+ && !(equality_only
+ ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+ : int_mode_for_size (incr * BITS_PER_UNIT, 0).exists ())) /* int_mode_for_size takes bits; INCR is in bytes. */
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode) /* LEN carries no mode (e.g. a constant): count in word_mode. */
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx (); /* Start of the loop body. */
+ cmp_label = gen_label_rtx (); /* Loop test (ITER against LEN). */
+ ne_label = gen_label_rtx (); /* Reached when a chunk differs. */
+ res_label = gen_label_rtx (); /* Common exit, TARGET holds the result. */
+ iter = gen_reg_rtx (iter_mode); /* Byte offset of the current chunk. */
+
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+ || !can_compare_p (NE, int_cmp_mode, ccp_jump)) /* Test the NE branch on the mode actually chosen. */
+ { /* No exact-width integer mode with a usable NE branch. */
+ if (!equality_only) /* Give up before the assert: only the equality path checked by-pieces support. */
+ return NULL_RTX;
+ cmp_mode = BLKmode;
+ gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ emit_jump (cmp_label); /* Enter at the loop test, so the body is skipped when ITER is not below LEN. */
+ emit_label (top_label);
+
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp); /* Base address plus ITER. */
+
+ if (x_addr_mode != y_addr_mode) /* Otherwise TMP converted for X is reused for Y. */
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ x = change_address (x, cmp_mode, x_addr); /* X and Y now refer to the current chunk. */
+ y = change_address (y, cmp_mode, y_addr);
+
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX, /* X - Y in cmp_mode; nonzero means the chunks differ. */
+ true, OPTAB_LIB_WIDEN);
+
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX, /* Leave the loop at the first differing chunk. */
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10)); /* NOTE(review): 9/10 marks the mismatch exit as the likely path -- confirm intended. */
+
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter, /* ITER += INCR. */
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode, /* Loop back while ITER < LEN. */
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ if (target == NULL_RTX /* The result must live in a pseudo register. */
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ emit_move_insn (target, const0_rtx); /* Fell out of the loop: no chunk differed. */
+ emit_jump (res_label);
+ emit_barrier ();
+
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx); /* Only zero versus nonzero matters here. */
+ else if (incr == 1)
+ convert_move (target, part_res, false); /* NOTE(review): sign-extends the cmp_mode difference -- confirm the sign is right when bytes differ by more than 127. */
+ else
+ {
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 5:17 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 5:17 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:87147caf4eadff1b779896ddc22b8a0201b9d98e
commit 87147caf4eadff1b779896ddc22b8a0201b9d98e
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 139 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index a02b0b6ed52..8de92272a10 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2567,7 +2569,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2583,7 +2586,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2605,8 +2608,138 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_len)
+{
+ unsigned incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (len))
+ ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+ if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_len;
+
+ while (incr > 1
+ && !(equality_only
+ ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+ : int_mode_for_size (incr, 0).exists ()))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode)
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx ();
+ cmp_label = gen_label_rtx ();
+ ne_label = gen_label_rtx ();
+ res_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT)
+ {
+ cmp_mode = BLKmode;
+ gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+ if (!equality_only)
+ return NULL_RTX;
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ emit_jump (cmp_label);
+ emit_label (top_label);
+
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+ if (x_addr_mode != y_addr_mode)
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ x = change_address (x, cmp_mode, x_addr);
+ y = change_address (y, cmp_mode, y_addr);
+
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+ true, OPTAB_LIB_WIDEN);
+
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ if (target == NULL_RTX
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ emit_move_insn (target, const0_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx);
+ else if (incr == 1)
+ convert_move (target, part_res, false);
+ else
+ {
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 4:59 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 4:59 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:cbd6df23e279d3d9a4a5ca0b57b1dad32e5118c3
commit cbd6df23e279d3d9a4a5ca0b57b1dad32e5118c3
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 137 insertions(+), 3 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index a02b0b6ed52..2225fd8085b 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2583,7 +2585,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_size)
{
rtx result = 0;
@@ -2605,8 +2607,138 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_size);
+
return result;
}
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_size)
+{
+ unsigned incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (len))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (len)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1
+ && !(equality_only
+ ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+ : int_mode_for_size (incr, 0).exists ()))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode)
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx ();
+ cmp_label = gen_label_rtx ();
+ ne_label = gen_label_rtx ();
+ res_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT)
+ {
+ cmp_mode = BLKmode;
+ gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+ if (!equality_only)
+ return NULL_RTX;
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ emit_jump (cmp_label);
+ emit_label (top_label);
+
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+ if (x_addr_mode != y_addr_mode)
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ x = change_address (x, cmp_mode, x_addr);
+ y = change_address (y, cmp_mode, y_addr);
+
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+ true, OPTAB_LIB_WIDEN);
+
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ if (target == NULL_RTX
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ emit_move_insn (target, const0_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx);
+ else if (incr == 1)
+ convert_move (target, part_res, false);
+ else
+ {
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..6366675f231 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_size = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26 3:37 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26 3:37 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:d82adc21442ccf03a3937084b1de3dd518989da8
commit d82adc21442ccf03a3937084b1de3dd518989da8
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/expr.cc | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
2 files changed, 135 insertions(+), 2 deletions(-)
diff --git a/gcc/expr.cc b/gcc/expr.cc
index a02b0b6ed52..2225fd8085b 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2583,7 +2585,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_size)
{
rtx result = 0;
@@ -2605,8 +2607,138 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_size);
+
return result;
}
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_size)
+{
+ unsigned incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (len))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (len)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1
+ && !(equality_only
+ ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+ : int_mode_for_size (incr, 0).exists ()))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode)
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx ();
+ cmp_label = gen_label_rtx ();
+ ne_label = gen_label_rtx ();
+ res_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT)
+ {
+ cmp_mode = BLKmode;
+ gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+ if (!equality_only)
+ return NULL_RTX;
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ emit_jump (cmp_label);
+ emit_label (top_label);
+
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+ if (x_addr_mode != y_addr_mode)
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ x = change_address (x, cmp_mode, x_addr);
+ y = change_address (y, cmp_mode, y_addr);
+
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+ true, OPTAB_LIB_WIDEN);
+
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ if (target == NULL_RTX
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ emit_move_insn (target, const0_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx);
+ else if (incr == 1)
+ convert_move (target, part_res, false);
+ else
+ {
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..6366675f231 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_size = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-21 1:06 Alexandre Oliva
0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-21 1:06 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:ed9040c427c2092cb02ce21c6e228c65f68f13e2
commit ed9040c427c2092cb02ce21c6e228c65f68f13e2
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/expr.cc | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
2 files changed, 134 insertions(+), 2 deletions(-)
diff --git a/gcc/expr.cc b/gcc/expr.cc
index a02b0b6ed52..aabb6ed963d 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2583,7 +2585,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_size)
{
rtx result = 0;
@@ -2605,8 +2607,137 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_size);
+
return result;
}
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_size)
+{
+ unsigned incr = align / BITS_PER_UNIT;
+
+ if (CONST_INT_P (len))
+ ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (len)));
+
+ if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_size;
+
+ while (incr > 1
+ && !(equality_only
+ ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+ : int_mode_for_size (incr, 0).exists ()))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode)
+ iter_mode = word_mode;
+
+ top_label = gen_label_rtx ();
+ cmp_label = gen_label_rtx ();
+ ne_label = gen_label_rtx ();
+ res_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT)
+ {
+ cmp_mode = BLKmode;
+ gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+ if (!equality_only)
+ return NULL_RTX;
+ }
+ else
+ cmp_mode = int_cmp_mode;
+
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+
+ emit_jump (cmp_label);
+ emit_label (top_label);
+
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+ if (x_addr_mode != y_addr_mode)
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+ x = change_address (x, cmp_mode, x_addr);
+ y = change_address (y, cmp_mode, y_addr);
+
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+ true, OPTAB_LIB_WIDEN);
+
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+
+ if (target == NULL_RTX
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ emit_move_insn (target, const0_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ emit_label (ne_label);
+
+ if (equality_only)
+ emit_move_insn (target, const1_rtx);
+ else if (incr == 1)
+ convert_move (target, part_res, false);
+ else
+ {
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only, 1, 0);
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
\f
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..6366675f231 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_size = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2023-01-27 5:58 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-26 6:34 [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander Alexandre Oliva
-- strict thread matches above, loose matches on Subject: below --
2023-01-27 5:58 Alexandre Oliva
2023-01-27 2:25 Alexandre Oliva
2023-01-27 1:57 Alexandre Oliva
2023-01-26 8:44 Alexandre Oliva
2023-01-26 8:21 Alexandre Oliva
2023-01-26 8:01 Alexandre Oliva
2023-01-26 7:02 Alexandre Oliva
2023-01-26 6:40 Alexandre Oliva
2023-01-26 6:38 Alexandre Oliva
2023-01-26 6:14 Alexandre Oliva
2023-01-26 6:09 Alexandre Oliva
2023-01-26 5:17 Alexandre Oliva
2023-01-26 4:59 Alexandre Oliva
2023-01-26 3:37 Alexandre Oliva
2023-01-21 1:06 Alexandre Oliva
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).