public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-21  1:06 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-21  1:06 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:ed9040c427c2092cb02ce21c6e228c65f68f13e2

commit ed9040c427c2092cb02ce21c6e228c65f68f13e2
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/expr.cc | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h  |   3 +-
 2 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/gcc/expr.cc b/gcc/expr.cc
index a02b0b6ed52..aabb6ed963d 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2583,7 +2585,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_size)
 {
   rtx result = 0;
 
@@ -2605,8 +2607,137 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_size);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_size)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1
+	 && !(equality_only
+	      ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+	      : int_mode_for_size (incr, 0).exists ()))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT)
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+      if (!equality_only)
+	return NULL_RTX;
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else if (incr == 1)
+    convert_move (target, part_res, false);
+  else
+    {
+      /* ??? Re-compare the block found to be different one byte at a
+	 time.  We could do better using part_res, and being careful
+	 about endianness.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only, 1, 0);
+      convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..6366675f231 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_size = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-27  5:58 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-27  5:58 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:1f7a6a0f935c73416ef530e050a316dc0fc4d5ad

commit 1f7a6a0f935c73416ef530e050a316dc0fc4d5ad
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Thu Jan 26 22:52:20 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc                                    |   3 +-
 gcc/expr.cc                                        | 165 ++++++++++++++++++++-
 gcc/expr.h                                         |   3 +-
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   6 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 ++
 5 files changed, 184 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index a9f92e9eb29..dac3b83f627 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..beb2f6f7bf7 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..bcd774c0f77
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcmp -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not "memcmp" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..6b4fef0b20f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memcmp" } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-27  2:25 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-27  2:25 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:be2ffb9c759274c1e457c4c33b6695b5ca4b71a1

commit be2ffb9c759274c1e457c4c33b6695b5ca4b71a1
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Thu Jan 26 22:52:20 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc                                    |   3 +-
 gcc/expr.cc                                        | 165 ++++++++++++++++++++-
 gcc/expr.h                                         |   3 +-
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   6 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 ++
 5 files changed, 184 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 4bf404a8a78..2c215bc29df 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..beb2f6f7bf7 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..bcd774c0f77
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcmp -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not "memcmp" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..6b4fef0b20f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memcmp" } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-27  1:57 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-27  1:57 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:acb03e3137a3c75089625af0ea88dc19715607b5

commit acb03e3137a3c75089625af0ea88dc19715607b5
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Thu Jan 26 22:52:20 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc                                    |   3 +-
 gcc/expr.cc                                        | 165 ++++++++++++++++++++-
 gcc/expr.h                                         |   3 +-
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |   6 +
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |  11 ++
 5 files changed, 184 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..beb2f6f7bf7 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 00000000000..bcd774c0f77
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops=memcmp -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* { dg-final { scan-assembler-not "memcmp" } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 00000000000..6b4fef0b20f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { scan-assembler-not "memcmp" } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  8:44 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  8:44 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:bfd738f7bf883f39cef1d0e8bf08594749e542ce

commit bfd738f7bf883f39cef1d0e8bf08594749e542ce
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..beb2f6f7bf7 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  8:21 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  8:21 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:c234c022348c1ada7594294ade986951521a6a5b

commit c234c022348c1ada7594294ade986951521a6a5b
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..7cb693525ba 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (GET_MODE_BITSIZE (GET_MODE (target))
+	       > GET_MODE_BITSIZE (cmp_mode))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+	  emit_barrier ();
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  8:01 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  8:01 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:14be539acc63313992496f72f4ebb23c44804ed8

commit 14be539acc63313992496f72f4ebb23c44804ed8
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 146 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..8e40ae8a15f 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,145 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  7:02 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  7:02 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:4707a9232f8aa7f60e8745778a1f7f75bdc8540a

commit 4707a9232f8aa7f60e8745778a1f7f75bdc8540a
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 143 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..3e40e1e588f 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,142 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  6:40 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  6:40 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:5198c39dd33b64da34d1c76cee6592d9814f0861

commit 5198c39dd33b64da34d1c76cee6592d9814f0861
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..2913452797c 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,146 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else if (incr == 1)
+    convert_move (target, part_res, false);
+  else if (incr > UNITS_PER_WORD)
+    {
+      /* ??? Re-compare the block found to be different one word at a
+	 time.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  BITS_PER_WORD, 0);
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+  else
+    {
+      /* ??? Re-compare the block found to be different one byte at a
+	 time.  We could do better using part_res, and being careful
+	 about endianness.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  BITS_PER_UNIT, 0);
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  6:38 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  6:38 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:abc556fb901c7566e61fb79989b57df19d30fc32

commit abc556fb901c7566e61fb79989b57df19d30fc32
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..3116ad0ab09 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,146 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else if (incr == 1)
+    convert_move (target, part_res, false);
+  else if (incr > UNITS_PER_WORD)
+    {
+      /* ??? Re-compare the block found to be different one word at a
+	 time.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  BITS_PER_WORD, 0);
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+  else
+    {
+      /* ??? Re-compare the block found to be different one byte at a
+	 time.  We could do better using part_res, and being careful
+	 about endianness.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  BITS_PER_UNIT, 0);
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  6:34 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  6:34 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:856cf5e0722d63ddad1754943bfc1e0363184736

commit 856cf5e0722d63ddad1754943bfc1e0363184736
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..ae8d29d14bd 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,146 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else if (incr == 1)
+    convert_move (target, part_res, false);
+  else if (incr > UNITS_PER_WORD)
+    {
+      /* ??? Re-compare the block found to be different one word at a
+	 time.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  incr * BITS_PER_UNIT, 0);
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+  else
+    {
+      /* ??? Re-compare the block found to be different one byte at a
+	 time.  We could do better using part_res, and being careful
+	 about endianness.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  BITS_PER_UNIT, 0);
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  6:14 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  6:14 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:ddb78c573c96d1368519421fa71b9a3e3f9c5ce5

commit ddb78c573c96d1368519421fa71b9a3e3f9c5ce5
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 140 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..933a28e3891 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,139 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !(equality_only
+	      ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+	      : int_mode_for_size (incr, 0).exists ()))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+      if (!equality_only)
+	return NULL_RTX;
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else if (incr == 1)
+    convert_move (target, part_res, false);
+  else
+    {
+      /* ??? Re-compare the block found to be different one byte at a
+	 time.  We could do better using part_res, and being careful
+	 about endianness.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  BITS_PER_UNIT, 0);
+      convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  6:09 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  6:09 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:fd9673deea478839fd5a91ce8131e9d7a963de18

commit fd9673deea478839fd5a91ce8131e9d7a963de18
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 140 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..e8c53b4c7ce 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2607,8 +2610,139 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !(equality_only
+	      ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+	      : int_mode_for_size (incr, 0).exists ())
+	 && can_compare_p (NE, int_mode_for_size (incr, 0), ccp_jump))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT)
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+      if (!equality_only)
+	return NULL_RTX;
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else if (incr == 1)
+    convert_move (target, part_res, false);
+  else
+    {
+      /* ??? Re-compare the block found to be different one byte at a
+	 time.  We could do better using part_res, and being careful
+	 about endianness.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  BITS_PER_UNIT, 0);
+      convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  5:17 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  5:17 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:87147caf4eadff1b779896ddc22b8a0201b9d98e

commit 87147caf4eadff1b779896ddc22b8a0201b9d98e
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 139 insertions(+), 4 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index a02b0b6ed52..8de92272a10 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2567,7 +2569,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2583,7 +2586,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2605,8 +2608,138 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !(equality_only
+	      ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+	      : int_mode_for_size (incr, 0).exists ()))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT)
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+      if (!equality_only)
+	return NULL_RTX;
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else if (incr == 1)
+    convert_move (target, part_res, false);
+  else
+    {
+      /* ??? Re-compare the block found to be different one byte at a
+	 time.  We could do better using part_res, and being careful
+	 about endianness.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  BITS_PER_UNIT, 0);
+      convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  4:59 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  4:59 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:cbd6df23e279d3d9a4a5ca0b57b1dad32e5118c3

commit cbd6df23e279d3d9a4a5ca0b57b1dad32e5118c3
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/builtins.cc |   3 +-
 gcc/expr.cc     | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h      |   3 +-
 3 files changed, 137 insertions(+), 3 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
diff --git a/gcc/expr.cc b/gcc/expr.cc
index a02b0b6ed52..2225fd8085b 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2583,7 +2585,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_size)
 {
   rtx result = 0;
 
@@ -2605,8 +2607,138 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_size);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_size)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1
+	 && !(equality_only
+	      ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+	      : int_mode_for_size (incr, 0).exists ()))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT)
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+      if (!equality_only)
+	return NULL_RTX;
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else if (incr == 1)
+    convert_move (target, part_res, false);
+  else
+    {
+      /* ??? Re-compare the block found to be different one byte at a
+	 time.  We could do better using part_res, and being careful
+	 about endianness.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  BITS_PER_UNIT, 0);
+      convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..6366675f231 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_size = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
@ 2023-01-26  3:37 Alexandre Oliva
  0 siblings, 0 replies; 16+ messages in thread
From: Alexandre Oliva @ 2023-01-26  3:37 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:d82adc21442ccf03a3937084b1de3dd518989da8

commit d82adc21442ccf03a3937084b1de3dd518989da8
Author: Alexandre Oliva <oliva@gnu.org>
Date:   Fri Jan 20 22:01:15 2023 -0300

    add memcmp loop expander

Diff:
---
 gcc/expr.cc | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 gcc/expr.h  |   3 +-
 2 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/gcc/expr.cc b/gcc/expr.cc
index a02b0b6ed52..2225fd8085b 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
 static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
 static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -2583,7 +2585,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_size)
 {
   rtx result = 0;
 
@@ -2605,8 +2607,138 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_size);
+
   return result;
 }
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_size)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1
+	 && !(equality_only
+	      ? can_do_by_pieces (incr, align, COMPARE_BY_PIECES)
+	      : int_mode_for_size (incr, 0).exists ()))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LT;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT)
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (can_do_by_pieces (incr, align, COMPARE_BY_PIECES));
+      if (!equality_only)
+	return NULL_RTX;
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			   true, top_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (9, 10));
+
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+  emit_barrier ();
+
+  emit_label (ne_label);
+
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else if (incr == 1)
+    convert_move (target, part_res, false);
+  else
+    {
+      /* ??? Re-compare the block found to be different one byte at a
+	 time.  We could do better using part_res, and being careful
+	 about endianness.  */
+      part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+					  target, equality_only,
+					  BITS_PER_UNIT, 0);
+      convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 \f
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..6366675f231 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  bool might_overlap = false,
 				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_size = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2023-01-27  5:58 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-21  1:06 [gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander Alexandre Oliva
2023-01-26  3:37 Alexandre Oliva
2023-01-26  4:59 Alexandre Oliva
2023-01-26  5:17 Alexandre Oliva
2023-01-26  6:09 Alexandre Oliva
2023-01-26  6:14 Alexandre Oliva
2023-01-26  6:34 Alexandre Oliva
2023-01-26  6:38 Alexandre Oliva
2023-01-26  6:40 Alexandre Oliva
2023-01-26  7:02 Alexandre Oliva
2023-01-26  8:01 Alexandre Oliva
2023-01-26  8:21 Alexandre Oliva
2023-01-26  8:44 Alexandre Oliva
2023-01-27  1:57 Alexandre Oliva
2023-01-27  2:25 Alexandre Oliva
2023-01-27  5:58 Alexandre Oliva

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).