public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion
@ 2024-05-08  5:52 Christoph Müllner
  2024-05-08  5:52 ` [PATCH 2/2] RISC-V: Add cmpmemsi expansion Christoph Müllner
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Christoph Müllner @ 2024-05-08  5:52 UTC (permalink / raw)
  To: gcc-patches, Kito Cheng, Jim Wilson, Palmer Dabbelt,
	Andrew Waterman, Philipp Tomsich, Jeff Law, Vineet Gupta
  Cc: Christoph Müllner

cpymemsi expansion was available for RISC-V since the initial port.
However, there are not tests to detect regression.
This patch adds such tests.

Three of the tests target the expansion requirements (known length and
alignment). One test reuses an existing memcpy test from the by-pieces
framework (gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c).

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/cpymemsi-1.c: New test.
	* gcc.target/riscv/cpymemsi-2.c: New test.
	* gcc.target/riscv/cpymemsi-3.c: New test.
	* gcc.target/riscv/cpymemsi.c: New test.

Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
---
 gcc/testsuite/gcc.target/riscv/cpymemsi-1.c |  9 +++++
 gcc/testsuite/gcc.target/riscv/cpymemsi-2.c | 42 ++++++++++++++++++++
 gcc/testsuite/gcc.target/riscv/cpymemsi-3.c | 43 +++++++++++++++++++++
 gcc/testsuite/gcc.target/riscv/cpymemsi.c   | 22 +++++++++++
 4 files changed, 116 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cpymemsi-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cpymemsi-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cpymemsi.c

diff --git a/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c b/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
new file mode 100644
index 00000000000..983b564ccaf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
@@ -0,0 +1,9 @@
+/* { dg-do run } */
+/* { dg-options "-march=rv32gc -save-temps -g0 -fno-lto" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc -save-temps -g0 -fno-lto" { target { rv64 } } } */
+/* { dg-additional-options "-DRUN_FRACTION=11" { target simulator } } */
+/* { dg-timeout-factor 2 } */
+
+#include "../../gcc.dg/memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
diff --git a/gcc/testsuite/gcc.target/riscv/cpymemsi-2.c b/gcc/testsuite/gcc.target/riscv/cpymemsi-2.c
new file mode 100644
index 00000000000..833d1c04487
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cpymemsi-2.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gc" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc" { target { rv64 } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Og" "-Oz" } } */
+
+#include <stddef.h>
+#define aligned32 __attribute__ ((aligned (32)))
+
+const char myconst15[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst23[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst31[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+
+/* No expansion (unknown alignment) */
+#define MY_MEM_CPY_N(N)					\
+void my_mem_cpy_##N (char *b1, const char *b2)		\
+{							\
+  __builtin_memcpy (b1, b2, N);				\
+}
+
+/* No expansion (unknown alignment) */
+#define MY_MEM_CPY_CONST_N(N)				\
+void my_mem_cpy_const_##N (char *b1)			\
+{							\
+  __builtin_memcpy (b1, myconst##N, sizeof(myconst##N));\
+}
+
+MY_MEM_CPY_N(15)
+MY_MEM_CPY_CONST_N(15)
+
+MY_MEM_CPY_N(23)
+MY_MEM_CPY_CONST_N(23)
+
+MY_MEM_CPY_N(31)
+MY_MEM_CPY_CONST_N(31)
+
+/* { dg-final { scan-assembler-times "\t(call|tail)\tmemcpy" 6 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cpymemsi-3.c b/gcc/testsuite/gcc.target/riscv/cpymemsi-3.c
new file mode 100644
index 00000000000..803765195b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cpymemsi-3.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gc" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc" { target { rv64 } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Og" "-Oz" } } */
+
+#include <stddef.h>
+#define aligned32 __attribute__ ((aligned (32)))
+
+const char myconst15[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst23[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst31[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+
+#define MY_MEM_CPY_ALIGNED_N(N)				\
+void my_mem_cpy_aligned_##N(char *b1, const char *b2)	\
+{							\
+  b1 = __builtin_assume_aligned (b1, 4096);		\
+  b2 = __builtin_assume_aligned (b2, 4096);		\
+  __builtin_memcpy (b1, b2, N);				\
+}
+
+#define MY_MEM_CPY_ALIGNED_CONST_N(N)			\
+void my_mem_cpy_aligned_const_##N(char *b1)		\
+{							\
+  b1 = __builtin_assume_aligned (b1, 4096);		\
+  __builtin_memcpy (b1, myconst##N, sizeof(myconst##N));	\
+}
+
+MY_MEM_CPY_ALIGNED_N(15)
+MY_MEM_CPY_ALIGNED_CONST_N(15)
+
+MY_MEM_CPY_ALIGNED_N(23)
+MY_MEM_CPY_ALIGNED_CONST_N(23)
+
+MY_MEM_CPY_ALIGNED_N(31)
+MY_MEM_CPY_ALIGNED_CONST_N(31)
+
+/* { dg-final { scan-assembler-not "\t(call|tail)\tmemcpy" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cpymemsi.c b/gcc/testsuite/gcc.target/riscv/cpymemsi.c
new file mode 100644
index 00000000000..d27a488c8ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cpymemsi.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gc" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc" { target { rv64 } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Og" "-Oz" } } */
+
+#include <stddef.h>
+
+/* No expansion (unknown size) */
+void my_mem_cpy_n(char *b1, const char *b2, size_t n)
+{
+  __builtin_memcpy (b1, b2, n);
+}
+
+/* No expansion (unknown size) */
+void my_mem_cpy_aligned(char *b1, const char *b2, size_t n)
+{
+  b1 = __builtin_assume_aligned (b1, 4096);
+  b2 = __builtin_assume_aligned (b2, 4096);
+  __builtin_memcpy (b1, b2, n);
+}
+
+/* { dg-final { scan-assembler-times "\t(call|tail)\tmemcpy" 2 } } */
-- 
2.44.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 2/2] RISC-V: Add cmpmemsi expansion
  2024-05-08  5:52 [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion Christoph Müllner
@ 2024-05-08  5:52 ` Christoph Müllner
  2024-05-09 14:50   ` Jeff Law
  2024-05-08 22:21 ` [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion Jeff Law
  2024-05-10  4:01 ` Patrick O'Neill
  2 siblings, 1 reply; 9+ messages in thread
From: Christoph Müllner @ 2024-05-08  5:52 UTC (permalink / raw)
  To: gcc-patches, Kito Cheng, Jim Wilson, Palmer Dabbelt,
	Andrew Waterman, Philipp Tomsich, Jeff Law, Vineet Gupta
  Cc: Christoph Müllner

GCC has a generic cmpmemsi expansion via the by-pieces framework,
which shows some room for target-specific optimizations.
E.g. for comparing two aligned memory blocks of 15 bytes
we get the following sequence:

my_mem_cmp_aligned_15:
        li      a4,0
        j       .L2
.L8:
        bgeu    a4,a7,.L7
.L2:
        add     a2,a0,a4
        add     a3,a1,a4
        lbu     a5,0(a2)
        lbu     a6,0(a3)
        addi    a4,a4,1
        li      a7,15    // missed hoisting
        subw    a5,a5,a6
        andi    a5,a5,0xff // useless
        beq     a5,zero,.L8
        lbu     a0,0(a2) // loading again!
        lbu     a5,0(a3) // loading again!
        subw    a0,a0,a5
        ret
.L7:
        li      a0,0
        ret

Diff first byte: 15 insns
Diff second byte: 25 insns
No diff: 25 insns

Possible improvements:
* unroll the loop and use load-with-displacement to avoid offset increments
* load and compare multiple (aligned) bytes at once
* Use the bitmanip/strcmp result calculation (reverse words and
  synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence)

When applying these improvements we get the following sequence:

my_mem_cmp_aligned_15:
        ld      a5,0(a0)
        ld      a4,0(a1)
        bne     a5,a4,.L2
        ld      a5,8(a0)
        ld      a4,8(a1)
        slli    a5,a5,8
        slli    a4,a4,8
        bne     a5,a4,.L2
        li      a0,0
.L3:
        sext.w  a0,a0
        ret
.L2:
        rev8    a5,a5
        rev8    a4,a4
        sltu    a5,a5,a4
        neg     a5,a5
        ori     a0,a5,1
        j       .L3

Diff first byte: 11 insns
Diff second byte: 16 insns
No diff: 11 insns

This patch implements this improvements.

The tests consist of a execution test (similar to
gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c) and a few tests
that test the expansion conditions (known length and alignment).

Similar to the cpymemsi expansion this patch does not introduce any
gating for the cmpmemsi expansion (on top of requiring the known length,
alignment and Zbb).

Bootstrapped and SPEC CPU 2017 tested.

gcc/ChangeLog:

	* config/riscv/riscv-protos.h (riscv_expand_block_compare): New
	prototype.
	* config/riscv/riscv-string.cc (GEN_EMIT_HELPER2): New helper.
	(do_load_from_addr): Add support for HI and SI/64 modes.
	(emit_memcmp_scalar_load_and_compare): New helper to emit memcmp.
	(emit_memcmp_scalar_result_calculation): Likewise.
	(riscv_expand_block_compare_scalar): Likewise.
	(riscv_expand_block_compare): New RISC-V expander for memory compare.
	* config/riscv/riscv.md (cmpmemsi): New cmpmem expansion.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/cmpmemsi-1.c: New test.
	* gcc.target/riscv/cmpmemsi-2.c: New test.
	* gcc.target/riscv/cmpmemsi-3.c: New test.
	* gcc.target/riscv/cmpmemsi.c: New test.

Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
---
 gcc/config/riscv/riscv-protos.h             |   1 +
 gcc/config/riscv/riscv-string.cc            | 161 ++++++++++++++++++++
 gcc/config/riscv/riscv.md                   |  15 ++
 gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c |   6 +
 gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c |  42 +++++
 gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c |  43 ++++++
 gcc/testsuite/gcc.target/riscv/cmpmemsi.c   |  22 +++
 7 files changed, 290 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi.c

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index e5aebf3fc3d..30ffe30be1d 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -188,6 +188,7 @@ rtl_opt_pass * make_pass_avlprop (gcc::context *ctxt);
 rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
 
 /* Routines implemented in riscv-string.c.  */
+extern bool riscv_expand_block_compare (rtx, rtx, rtx, rtx);
 extern bool riscv_expand_block_move (rtx, rtx, rtx);
 
 /* Information about one CPU we know about.  */
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index b09b51d7526..9d4dc0cb827 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -86,6 +86,7 @@ GEN_EMIT_HELPER2(th_rev) /* do_th_rev2  */
 GEN_EMIT_HELPER2(th_tstnbz) /* do_th_tstnbz2  */
 GEN_EMIT_HELPER3(xor) /* do_xor3  */
 GEN_EMIT_HELPER2(zero_extendqi) /* do_zero_extendqi2  */
+GEN_EMIT_HELPER2(zero_extendhi) /* do_zero_extendhi2  */
 
 #undef GEN_EMIT_HELPER2
 #undef GEN_EMIT_HELPER3
@@ -109,6 +110,10 @@ do_load_from_addr (machine_mode mode, rtx dest, rtx addr_reg, rtx addr)
 
   if (mode == QImode)
     do_zero_extendqi2 (dest, mem);
+  else if (mode == HImode)
+    do_zero_extendhi2 (dest, mem);
+  else if (mode == SImode && TARGET_64BIT)
+    emit_insn (gen_zero_extendsidi2 (dest, mem));
   else if (mode == Xmode)
     emit_move_insn (dest, mem);
   else
@@ -610,6 +615,162 @@ riscv_expand_strlen (rtx result, rtx src, rtx search_char, rtx align)
   return false;
 }
 
+static void
+emit_memcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
+				     unsigned HOST_WIDE_INT nbytes,
+				     rtx data1, rtx data2,
+				     rtx diff_label, rtx final_label)
+{
+  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
+  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
+  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
+  unsigned HOST_WIDE_INT offset = 0;
+
+  while (nbytes > 0)
+    {
+      unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
+      machine_mode load_mode;
+
+      /* Special cases to avoid masking of trailing bytes.  */
+      if (cmp_bytes == 1)
+	load_mode = QImode;
+      else if (cmp_bytes == 2)
+	load_mode = HImode;
+      else if (cmp_bytes == 4)
+	load_mode = SImode;
+      else
+	load_mode = Xmode;
+
+      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset));
+      do_load_from_addr (load_mode, data1, addr1, src1);
+      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset));
+      do_load_from_addr (load_mode, data2, addr2, src2);
+
+      /* Fast-path for a single byte.  */
+      if (cmp_bytes == 1)
+	{
+	  rtx tmp = gen_reg_rtx (Xmode);
+	  do_sub3 (tmp, data1, data2);
+	  emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
+	  emit_jump_insn (gen_jump (final_label));
+	  emit_barrier (); /* No fall-through.  */
+	  return;
+	}
+
+      /* Shift off trailing bytes in words if needed.  */
+      unsigned int load_bytes = GET_MODE_SIZE (load_mode).to_constant ();
+      if (cmp_bytes < load_bytes)
+	{
+	  int shamt = (load_bytes - cmp_bytes) * BITS_PER_UNIT;
+	  do_ashl3 (data1, data1, GEN_INT (shamt));
+	  do_ashl3 (data2, data2, GEN_INT (shamt));
+	}
+
+      /* Break out if data1 != data2 */
+      rtx cond = gen_rtx_NE (VOIDmode, data1, data2);
+      emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond, data1,
+					     data2, diff_label));
+      /* Fall-through on equality.  */
+
+      offset += cmp_bytes;
+      nbytes -= cmp_bytes;
+    }
+}
+
+static void
+emit_memcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2)
+{
+  /* Get bytes in big-endian order and compare as words.  */
+  do_bswap2 (data1, data1);
+  do_bswap2 (data2, data2);
+  /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
+  rtx tmp = gen_reg_rtx (Xmode);
+  emit_insn (gen_slt_3 (LTU, Xmode, Xmode, tmp, data1, data2));
+  do_neg2 (tmp, tmp);
+  do_ior3 (tmp, tmp, const1_rtx);
+  emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
+}
+
+static bool
+riscv_expand_block_compare_scalar (rtx result, rtx src1, rtx src2, rtx nbytes)
+{
+  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
+
+  if (optimize_function_for_size_p (cfun))
+    return false;
+
+  /* We don't support big endian.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  if (!CONST_INT_P (nbytes))
+    return false;
+
+  /* We need the rev (bswap) instruction.  */
+  if (!TARGET_ZBB)
+    return false;
+
+  unsigned HOST_WIDE_INT length = UINTVAL (nbytes);
+
+  /* Limit to 12-bits (maximum load-offset).  */
+  if (length > IMM_REACH)
+    length = IMM_REACH;
+
+  /* We need xlen-aligned memory.  */
+  unsigned HOST_WIDE_INT align = MIN (MEM_ALIGN (src1), MEM_ALIGN (src2));
+  if (align < (xlen * BITS_PER_UNIT))
+    return false;
+
+  if (length > RISCV_MAX_MOVE_BYTES_STRAIGHT)
+    return false;
+
+  /* Overall structure of emitted code:
+       Load-and-compare:
+	 - Load data1 and data2
+	 - Compare strings and either:
+	   - Fall-through on equality
+	   - Jump to end_label if data1 != data2
+       // Fall-through
+       Set result to 0 and jump to final_label
+     diff_label:
+       Calculate result value with the use of data1 and data2
+       Jump to final_label
+     final_label:
+       // Nothing.  */
+
+  rtx data1 = gen_reg_rtx (Xmode);
+  rtx data2 = gen_reg_rtx (Xmode);
+  rtx diff_label = gen_label_rtx ();
+  rtx final_label = gen_label_rtx ();
+
+  /* Generate a sequence of zbb instructions to compare out
+     to the length specified.  */
+  emit_memcmp_scalar_load_and_compare (result, src1, src2, length,
+				       data1, data2,
+				       diff_label, final_label);
+
+  emit_insn (gen_rtx_SET (result, gen_rtx_CONST_INT (SImode, 0)));
+  emit_jump_insn (gen_jump (final_label));
+  emit_barrier (); /* No fall-through.  */
+
+  emit_label (diff_label);
+  emit_memcmp_scalar_result_calculation (result, data1, data2);
+  emit_jump_insn (gen_jump (final_label));
+  emit_barrier (); /* No fall-through.  */
+
+  emit_label (final_label);
+  return true;
+}
+
+bool
+riscv_expand_block_compare (rtx result, rtx src1, rtx src2, rtx nbytes)
+{
+  if (stringop_strategy & STRATEGY_SCALAR)
+    return riscv_expand_block_compare_scalar (result, src1, src2, nbytes);
+
+  return false;
+}
+
 /* Emit straight-line code to move LENGTH bytes from SRC to DEST.
    Assume that the areas do not overlap.  */
 
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index d4676507b45..6a1181cb180 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2585,6 +2585,21 @@ (define_split
   DONE;
 })
 
+(define_expand "cmpmemsi"
+  [(parallel [(set (match_operand:SI 0)
+               (compare:SI (match_operand:BLK 1)
+                           (match_operand:BLK 2)))
+	      (use (match_operand:SI 3))
+	      (use (match_operand:SI 4))])]
+  "!optimize_size"
+{
+  if (riscv_expand_block_compare (operands[0], operands[1], operands[2],
+                                  operands[3]))
+    DONE;
+  else
+    FAIL;
+})
+
 (define_expand "cpymem<mode>"
   [(parallel [(set (match_operand:BLK 0 "general_operand")
 		   (match_operand:BLK 1 "general_operand"))
diff --git a/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c b/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
new file mode 100644
index 00000000000..d7e0bc47407
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
@@ -0,0 +1,6 @@
+/* { dg-do run } */
+/* { dg-options "-march=rv32gc_zbb -save-temps -g0 -fno-lto" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc_zbb -save-temps -g0 -fno-lto" { target { rv64 } } } */
+/* { dg-timeout-factor 2 } */
+
+#include "../../gcc.dg/memcmp-1.c"
diff --git a/gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c b/gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c
new file mode 100644
index 00000000000..77aa88b5b9c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gc_zbb" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc_zbb" { target { rv64 } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Og" "-Oz" } } */
+
+#include <stddef.h>
+#define aligned32 __attribute__ ((aligned (32)))
+
+const char myconst15[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst23[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst31[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+
+/* No expansion (unknown alignment) */
+#define MY_MEM_CMP_N(N)						\
+int my_mem_cmp_##N(const char *b1, const char *b2)		\
+{								\
+  return __builtin_memcmp (b1, b2, N);				\
+}
+
+/* No expansion (unknown alignment) */
+#define MY_MEM_CMP_CONST_N(N)					\
+int my_mem_cmp_const_##N(const char *b1)			\
+{								\
+  return __builtin_memcmp (b1, myconst##N, sizeof(myconst##N));	\
+}
+
+MY_MEM_CMP_N(15)
+MY_MEM_CMP_CONST_N(15)
+
+MY_MEM_CMP_N(23)
+MY_MEM_CMP_CONST_N(23)
+
+MY_MEM_CMP_N(31)
+MY_MEM_CMP_CONST_N(31)
+
+/* { dg-final { scan-assembler-times "\t(call|tail)\tmemcmp" 6 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c b/gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c
new file mode 100644
index 00000000000..193cd4a343e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gc_zbb" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc_zbb" { target { rv64 } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Og" "-Oz" } } */
+
+#include <stddef.h>
+#define aligned32 __attribute__ ((aligned (32)))
+
+const char myconst15[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst23[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+const char myconst31[] aligned32 = { 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7,
+				     0, 1, 2, 3, 4, 5, 6, 7 };
+
+#define MY_MEM_CMP_ALIGNED_N(N)					\
+int my_mem_cmp_aligned_##N(const char *b1, const char *b2)	\
+{								\
+  b1 = __builtin_assume_aligned (b1, 4096);			\
+  b2 = __builtin_assume_aligned (b2, 4096);			\
+  return __builtin_memcmp (b1, b2, N);				\
+}
+
+#define MY_MEM_CMP_ALIGNED_CONST_N(N)				\
+int my_mem_cmp_aligned_const_##N(const char *b1)		\
+{								\
+  b1 = __builtin_assume_aligned (b1, 4096);			\
+  return __builtin_memcmp (b1, myconst##N, sizeof(myconst##N));	\
+}
+
+MY_MEM_CMP_ALIGNED_N(15)
+MY_MEM_CMP_ALIGNED_CONST_N(15)
+
+MY_MEM_CMP_ALIGNED_N(23)
+MY_MEM_CMP_ALIGNED_CONST_N(23)
+
+MY_MEM_CMP_ALIGNED_N(31)
+MY_MEM_CMP_ALIGNED_CONST_N(31)
+
+/* { dg-final { scan-assembler-not "\t(call|tail)\tmemcmp" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/cmpmemsi.c b/gcc/testsuite/gcc.target/riscv/cmpmemsi.c
new file mode 100644
index 00000000000..f4ccf269924
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cmpmemsi.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gc_zbb" { target { rv32 } } } */
+/* { dg-options "-march=rv64gc_zbb" { target { rv64 } } } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Og" "-Oz" } } */
+
+#include <stddef.h>
+
+/* No expansion (unknown size) */
+int my_mem_cmp_n(const char *b1, const char *b2, size_t n)
+{
+  return __builtin_memcmp (b1, b2, n);
+}
+
+/* No expansion (unknown size) */
+int my_mem_cmp_aligned(const char *b1, const char *b2, size_t n)
+{
+  b1 = __builtin_assume_aligned (b1, 4096);
+  b2 = __builtin_assume_aligned (b2, 4096);
+  return __builtin_memcmp (b1, b2, n);
+}
+
+/* { dg-final { scan-assembler-times "\t(call|tail)\tmemcmp" 2 } } */
-- 
2.44.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion
  2024-05-08  5:52 [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion Christoph Müllner
  2024-05-08  5:52 ` [PATCH 2/2] RISC-V: Add cmpmemsi expansion Christoph Müllner
@ 2024-05-08 22:21 ` Jeff Law
  2024-05-10  4:01 ` Patrick O'Neill
  2 siblings, 0 replies; 9+ messages in thread
From: Jeff Law @ 2024-05-08 22:21 UTC (permalink / raw)
  To: Christoph Müllner, gcc-patches, Kito Cheng, Jim Wilson,
	Palmer Dabbelt, Andrew Waterman, Philipp Tomsich, Vineet Gupta



On 5/7/24 11:52 PM, Christoph Müllner wrote:
> cpymemsi expansion was available for RISC-V since the initial port.
> However, there are not tests to detect regression.
> This patch adds such tests.
> 
> Three of the tests target the expansion requirements (known length and
> alignment). One test reuses an existing memcpy test from the by-pieces
> framework (gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c).
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/riscv/cpymemsi-1.c: New test.
> 	* gcc.target/riscv/cpymemsi-2.c: New test.
> 	* gcc.target/riscv/cpymemsi-3.c: New test.
> 	* gcc.target/riscv/cpymemsi.c: New test.
OK
jeff


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] RISC-V: Add cmpmemsi expansion
  2024-05-08  5:52 ` [PATCH 2/2] RISC-V: Add cmpmemsi expansion Christoph Müllner
@ 2024-05-09 14:50   ` Jeff Law
  2024-05-15  6:49     ` Christoph Müllner
  0 siblings, 1 reply; 9+ messages in thread
From: Jeff Law @ 2024-05-09 14:50 UTC (permalink / raw)
  To: Christoph Müllner, gcc-patches, Kito Cheng, Jim Wilson,
	Palmer Dabbelt, Andrew Waterman, Philipp Tomsich, Vineet Gupta



On 5/7/24 11:52 PM, Christoph Müllner wrote:
> GCC has a generic cmpmemsi expansion via the by-pieces framework,
> which shows some room for target-specific optimizations.
> E.g. for comparing two aligned memory blocks of 15 bytes
> we get the following sequence:
> 
> my_mem_cmp_aligned_15:
>          li      a4,0
>          j       .L2
> .L8:
>          bgeu    a4,a7,.L7
> .L2:
>          add     a2,a0,a4
>          add     a3,a1,a4
>          lbu     a5,0(a2)
>          lbu     a6,0(a3)
>          addi    a4,a4,1
>          li      a7,15    // missed hoisting
>          subw    a5,a5,a6
>          andi    a5,a5,0xff // useless
>          beq     a5,zero,.L8
>          lbu     a0,0(a2) // loading again!
>          lbu     a5,0(a3) // loading again!
>          subw    a0,a0,a5
>          ret
> .L7:
>          li      a0,0
>          ret
> 
> Diff first byte: 15 insns
> Diff second byte: 25 insns
> No diff: 25 insns
> 
> Possible improvements:
> * unroll the loop and use load-with-displacement to avoid offset increments
> * load and compare multiple (aligned) bytes at once
> * Use the bitmanip/strcmp result calculation (reverse words and
>    synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence)
> 
> When applying these improvements we get the following sequence:
> 
> my_mem_cmp_aligned_15:
>          ld      a5,0(a0)
>          ld      a4,0(a1)
>          bne     a5,a4,.L2
>          ld      a5,8(a0)
>          ld      a4,8(a1)
>          slli    a5,a5,8
>          slli    a4,a4,8
>          bne     a5,a4,.L2
>          li      a0,0
> .L3:
>          sext.w  a0,a0
>          ret
> .L2:
>          rev8    a5,a5
>          rev8    a4,a4
>          sltu    a5,a5,a4
>          neg     a5,a5
>          ori     a0,a5,1
>          j       .L3
> 
> Diff first byte: 11 insns
> Diff second byte: 16 insns
> No diff: 11 insns
> 
> This patch implements this improvements.
> 
> The tests consist of a execution test (similar to
> gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c) and a few tests
> that test the expansion conditions (known length and alignment).
> 
> Similar to the cpymemsi expansion this patch does not introduce any
> gating for the cmpmemsi expansion (on top of requiring the known length,
> alignment and Zbb).
> 
> Bootstrapped and SPEC CPU 2017 tested.
> 
> gcc/ChangeLog:
> 
> 	* config/riscv/riscv-protos.h (riscv_expand_block_compare): New
> 	prototype.
> 	* config/riscv/riscv-string.cc (GEN_EMIT_HELPER2): New helper.
> 	(do_load_from_addr): Add support for HI and SI/64 modes.
> 	(emit_memcmp_scalar_load_and_compare): New helper to emit memcmp.
> 	(emit_memcmp_scalar_result_calculation): Likewise.
> 	(riscv_expand_block_compare_scalar): Likewise.
> 	(riscv_expand_block_compare): New RISC-V expander for memory compare.
> 	* config/riscv/riscv.md (cmpmemsi): New cmpmem expansion.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/riscv/cmpmemsi-1.c: New test.
> 	* gcc.target/riscv/cmpmemsi-2.c: New test.
> 	* gcc.target/riscv/cmpmemsi-3.c: New test.
> 	* gcc.target/riscv/cmpmemsi.c: New test.
> 
> Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
> ---
>   gcc/config/riscv/riscv-protos.h             |   1 +
>   gcc/config/riscv/riscv-string.cc            | 161 ++++++++++++++++++++
>   gcc/config/riscv/riscv.md                   |  15 ++
>   gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c |   6 +
>   gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c |  42 +++++
>   gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c |  43 ++++++
>   gcc/testsuite/gcc.target/riscv/cmpmemsi.c   |  22 +++
>   7 files changed, 290 insertions(+)
>   create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
>   create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c
>   create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c
>   create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi.c
> 
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index e5aebf3fc3d..30ffe30be1d 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -188,6 +188,7 @@ rtl_opt_pass * make_pass_avlprop (gcc::context *ctxt);
>   rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
>   
>   /* Routines implemented in riscv-string.c.  */
> +extern bool riscv_expand_block_compare (rtx, rtx, rtx, rtx);
>   extern bool riscv_expand_block_move (rtx, rtx, rtx);
>   
>   /* Information about one CPU we know about.  */
> diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
> index b09b51d7526..9d4dc0cb827 100644
> --- a/gcc/config/riscv/riscv-string.cc
> +++ b/gcc/config/riscv/riscv-string.cc
> @@ -86,6 +86,7 @@ GEN_EMIT_HELPER2(th_rev) /* do_th_rev2  */
>   GEN_EMIT_HELPER2(th_tstnbz) /* do_th_tstnbz2  */
>   GEN_EMIT_HELPER3(xor) /* do_xor3  */
>   GEN_EMIT_HELPER2(zero_extendqi) /* do_zero_extendqi2  */
> +GEN_EMIT_HELPER2(zero_extendhi) /* do_zero_extendhi2  */
>   
>   #undef GEN_EMIT_HELPER2
>   #undef GEN_EMIT_HELPER3
> @@ -109,6 +110,10 @@ do_load_from_addr (machine_mode mode, rtx dest, rtx addr_reg, rtx addr)
>   
>     if (mode == QImode)
>       do_zero_extendqi2 (dest, mem);
> +  else if (mode == HImode)
> +    do_zero_extendhi2 (dest, mem);
> +  else if (mode == SImode && TARGET_64BIT)
> +    emit_insn (gen_zero_extendsidi2 (dest, mem));
>     else if (mode == Xmode)
>       emit_move_insn (dest, mem);
>     else
> @@ -610,6 +615,162 @@ riscv_expand_strlen (rtx result, rtx src, rtx search_char, rtx align)
>     return false;
>   }
>   
> +static void
> +emit_memcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
> +				     unsigned HOST_WIDE_INT nbytes,
> +				     rtx data1, rtx data2,
> +				     rtx diff_label, rtx final_label)
Needs a function comment.


> +{
> +  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
> +  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
> +  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
> +  unsigned HOST_WIDE_INT offset = 0;
> +
> +  while (nbytes > 0)
> +    {
> +      unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
> +      machine_mode load_mode;
> +
> +      /* Special cases to avoid masking of trailing bytes.  */
> +      if (cmp_bytes == 1)
> +	load_mode = QImode;
> +      else if (cmp_bytes == 2)
> +	load_mode = HImode;
> +      else if (cmp_bytes == 4)
> +	load_mode = SImode;
> +      else
> +	load_mode = Xmode;
> +
> +      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset));
> +      do_load_from_addr (load_mode, data1, addr1, src1);
> +      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset));
> +      do_load_from_addr (load_mode, data2, addr2, src2);
So we're assuming that something simplifies x + 0 -> x for the first 
iteration of this loop.  I'm not sure that's going to be true for -O0. 
Can you double check that if this expansion code is used with -O0 that 
it gets cleaned up?  adjust_address or something else might be able to 
do this for you automagically.

> +
> +static void
> +emit_memcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2)
Needs function comment.



> +
> +static bool
> +riscv_expand_block_compare_scalar (rtx result, rtx src1, rtx src2, rtx nbytes)
Function comment.


> +
> +  /* We need xlen-aligned memory.  */
> +  unsigned HOST_WIDE_INT align = MIN (MEM_ALIGN (src1), MEM_ALIGN (src2));
> +  if (align < (xlen * BITS_PER_UNIT))
> +    return false;
Not needed for this submission, but this probably could be relaxed based 
on uarch tuning.

You might also ponder if we can do the result calculation in WORD_MODE. 
One of the things in my queue to submit is adjustment to the inline 
strcmp code to do that.  When done carefully we can squash out a sign 
extension.  Again, not strictly necessary, but perhaps a followup if 
you're interested.

So generally OK.  The only technical concern is generation of x+0 as an 
address.  Depending on what you find there take appropriate action.  If 
nothing is needed in that space, then just fixup the missing comments 
and this is fine for the trunk.

Jeff

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion
  2024-05-08  5:52 [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion Christoph Müllner
  2024-05-08  5:52 ` [PATCH 2/2] RISC-V: Add cmpmemsi expansion Christoph Müllner
  2024-05-08 22:21 ` [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion Jeff Law
@ 2024-05-10  4:01 ` Patrick O'Neill
  2024-05-15  5:00   ` Christoph Müllner
  2 siblings, 1 reply; 9+ messages in thread
From: Patrick O'Neill @ 2024-05-10  4:01 UTC (permalink / raw)
  To: Christoph Müllner; +Cc: gcc-patches, Jeff Law

Hi Christoph,

cpymemsi-1.c fails on a subset of newlib targets.

"UNRESOLVED: gcc.target/riscv/cpymemsi-1.c   -O0  compilation failed to 
produce executable"

Full list of failing targets here (New Failures section): 
https://github.com/patrick-rivos/gcc-postcommit-ci/issues/906

Thanks,

Patrick

On 5/7/24 22:52, Christoph Müllner wrote:
> cpymemsi expansion was available for RISC-V since the initial port.
> However, there are not tests to detect regression.
> This patch adds such tests.
>
> Three of the tests target the expansion requirements (known length and
> alignment). One test reuses an existing memcpy test from the by-pieces
> framework (gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c).
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/riscv/cpymemsi-1.c: New test.
> 	* gcc.target/riscv/cpymemsi-2.c: New test.
> 	* gcc.target/riscv/cpymemsi-3.c: New test.
> 	* gcc.target/riscv/cpymemsi.c: New test.
>
> Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
> ---

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion
  2024-05-10  4:01 ` Patrick O'Neill
@ 2024-05-15  5:00   ` Christoph Müllner
  2024-05-15 20:22     ` Patrick O'Neill
  0 siblings, 1 reply; 9+ messages in thread
From: Christoph Müllner @ 2024-05-15  5:00 UTC (permalink / raw)
  To: Patrick O'Neill; +Cc: gcc-patches, Jeff Law

On Fri, May 10, 2024 at 6:01 AM Patrick O'Neill <patrick@rivosinc.com> wrote:
>
> Hi Christoph,
>
> cpymemsi-1.c fails on a subset of newlib targets.
>
> "UNRESOLVED: gcc.target/riscv/cpymemsi-1.c   -O0  compilation failed to
> produce executable"
>
> Full list of failing targets here (New Failures section):
> https://github.com/patrick-rivos/gcc-postcommit-ci/issues/906

Thanks for reporting!
I'm having a hard time figuring out what the issue is here, as I can't
reproduce it locally.
This test is an execution test ("dg-do run"), so I wonder if this
might be the issue?

Thanks,
Christoph

>
> Thanks,
>
> Patrick
>
> On 5/7/24 22:52, Christoph Müllner wrote:
> > cpymemsi expansion was available for RISC-V since the initial port.
> > However, there are not tests to detect regression.
> > This patch adds such tests.
> >
> > Three of the tests target the expansion requirements (known length and
> > alignment). One test reuses an existing memcpy test from the by-pieces
> > framework (gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c).
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/riscv/cpymemsi-1.c: New test.
> >       * gcc.target/riscv/cpymemsi-2.c: New test.
> >       * gcc.target/riscv/cpymemsi-3.c: New test.
> >       * gcc.target/riscv/cpymemsi.c: New test.
> >
> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
> > ---

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] RISC-V: Add cmpmemsi expansion
  2024-05-09 14:50   ` Jeff Law
@ 2024-05-15  6:49     ` Christoph Müllner
  0 siblings, 0 replies; 9+ messages in thread
From: Christoph Müllner @ 2024-05-15  6:49 UTC (permalink / raw)
  To: Jeff Law
  Cc: gcc-patches, Kito Cheng, Jim Wilson, Palmer Dabbelt,
	Andrew Waterman, Philipp Tomsich, Vineet Gupta

On Thu, May 9, 2024 at 4:50 PM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 5/7/24 11:52 PM, Christoph Müllner wrote:
> > GCC has a generic cmpmemsi expansion via the by-pieces framework,
> > which shows some room for target-specific optimizations.
> > E.g. for comparing two aligned memory blocks of 15 bytes
> > we get the following sequence:
> >
> > my_mem_cmp_aligned_15:
> >          li      a4,0
> >          j       .L2
> > .L8:
> >          bgeu    a4,a7,.L7
> > .L2:
> >          add     a2,a0,a4
> >          add     a3,a1,a4
> >          lbu     a5,0(a2)
> >          lbu     a6,0(a3)
> >          addi    a4,a4,1
> >          li      a7,15    // missed hoisting
> >          subw    a5,a5,a6
> >          andi    a5,a5,0xff // useless
> >          beq     a5,zero,.L8
> >          lbu     a0,0(a2) // loading again!
> >          lbu     a5,0(a3) // loading again!
> >          subw    a0,a0,a5
> >          ret
> > .L7:
> >          li      a0,0
> >          ret
> >
> > Diff first byte: 15 insns
> > Diff second byte: 25 insns
> > No diff: 25 insns
> >
> > Possible improvements:
> > * unroll the loop and use load-with-displacement to avoid offset increments
> > * load and compare multiple (aligned) bytes at once
> > * Use the bitmanip/strcmp result calculation (reverse words and
> >    synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence)
> >
> > When applying these improvements we get the following sequence:
> >
> > my_mem_cmp_aligned_15:
> >          ld      a5,0(a0)
> >          ld      a4,0(a1)
> >          bne     a5,a4,.L2
> >          ld      a5,8(a0)
> >          ld      a4,8(a1)
> >          slli    a5,a5,8
> >          slli    a4,a4,8
> >          bne     a5,a4,.L2
> >          li      a0,0
> > .L3:
> >          sext.w  a0,a0
> >          ret
> > .L2:
> >          rev8    a5,a5
> >          rev8    a4,a4
> >          sltu    a5,a5,a4
> >          neg     a5,a5
> >          ori     a0,a5,1
> >          j       .L3
> >
> > Diff first byte: 11 insns
> > Diff second byte: 16 insns
> > No diff: 11 insns
> >
> > This patch implements this improvements.
> >
> > The tests consist of a execution test (similar to
> > gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c) and a few tests
> > that test the expansion conditions (known length and alignment).
> >
> > Similar to the cpymemsi expansion this patch does not introduce any
> > gating for the cmpmemsi expansion (on top of requiring the known length,
> > alignment and Zbb).
> >
> > Bootstrapped and SPEC CPU 2017 tested.
> >
> > gcc/ChangeLog:
> >
> >       * config/riscv/riscv-protos.h (riscv_expand_block_compare): New
> >       prototype.
> >       * config/riscv/riscv-string.cc (GEN_EMIT_HELPER2): New helper.
> >       (do_load_from_addr): Add support for HI and SI/64 modes.
> >       (emit_memcmp_scalar_load_and_compare): New helper to emit memcmp.
> >       (emit_memcmp_scalar_result_calculation): Likewise.
> >       (riscv_expand_block_compare_scalar): Likewise.
> >       (riscv_expand_block_compare): New RISC-V expander for memory compare.
> >       * config/riscv/riscv.md (cmpmemsi): New cmpmem expansion.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/riscv/cmpmemsi-1.c: New test.
> >       * gcc.target/riscv/cmpmemsi-2.c: New test.
> >       * gcc.target/riscv/cmpmemsi-3.c: New test.
> >       * gcc.target/riscv/cmpmemsi.c: New test.
> >
> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
> > ---
> >   gcc/config/riscv/riscv-protos.h             |   1 +
> >   gcc/config/riscv/riscv-string.cc            | 161 ++++++++++++++++++++
> >   gcc/config/riscv/riscv.md                   |  15 ++
> >   gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c |   6 +
> >   gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c |  42 +++++
> >   gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c |  43 ++++++
> >   gcc/testsuite/gcc.target/riscv/cmpmemsi.c   |  22 +++
> >   7 files changed, 290 insertions(+)
> >   create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
> >   create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi-2.c
> >   create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi-3.c
> >   create mode 100644 gcc/testsuite/gcc.target/riscv/cmpmemsi.c
> >
> > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> > index e5aebf3fc3d..30ffe30be1d 100644
> > --- a/gcc/config/riscv/riscv-protos.h
> > +++ b/gcc/config/riscv/riscv-protos.h
> > @@ -188,6 +188,7 @@ rtl_opt_pass * make_pass_avlprop (gcc::context *ctxt);
> >   rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
> >
> >   /* Routines implemented in riscv-string.c.  */
> > +extern bool riscv_expand_block_compare (rtx, rtx, rtx, rtx);
> >   extern bool riscv_expand_block_move (rtx, rtx, rtx);
> >
> >   /* Information about one CPU we know about.  */
> > diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
> > index b09b51d7526..9d4dc0cb827 100644
> > --- a/gcc/config/riscv/riscv-string.cc
> > +++ b/gcc/config/riscv/riscv-string.cc
> > @@ -86,6 +86,7 @@ GEN_EMIT_HELPER2(th_rev) /* do_th_rev2  */
> >   GEN_EMIT_HELPER2(th_tstnbz) /* do_th_tstnbz2  */
> >   GEN_EMIT_HELPER3(xor) /* do_xor3  */
> >   GEN_EMIT_HELPER2(zero_extendqi) /* do_zero_extendqi2  */
> > +GEN_EMIT_HELPER2(zero_extendhi) /* do_zero_extendhi2  */
> >
> >   #undef GEN_EMIT_HELPER2
> >   #undef GEN_EMIT_HELPER3
> > @@ -109,6 +110,10 @@ do_load_from_addr (machine_mode mode, rtx dest, rtx addr_reg, rtx addr)
> >
> >     if (mode == QImode)
> >       do_zero_extendqi2 (dest, mem);
> > +  else if (mode == HImode)
> > +    do_zero_extendhi2 (dest, mem);
> > +  else if (mode == SImode && TARGET_64BIT)
> > +    emit_insn (gen_zero_extendsidi2 (dest, mem));
> >     else if (mode == Xmode)
> >       emit_move_insn (dest, mem);
> >     else
> > @@ -610,6 +615,162 @@ riscv_expand_strlen (rtx result, rtx src, rtx search_char, rtx align)
> >     return false;
> >   }
> >
> > +static void
> > +emit_memcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
> > +                                  unsigned HOST_WIDE_INT nbytes,
> > +                                  rtx data1, rtx data2,
> > +                                  rtx diff_label, rtx final_label)
> Needs a function comment.

Will do.

>
>
> > +{
> > +  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
> > +  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
> > +  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
> > +  unsigned HOST_WIDE_INT offset = 0;
> > +
> > +  while (nbytes > 0)
> > +    {
> > +      unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
> > +      machine_mode load_mode;
> > +
> > +      /* Special cases to avoid masking of trailing bytes.  */
> > +      if (cmp_bytes == 1)
> > +     load_mode = QImode;
> > +      else if (cmp_bytes == 2)
> > +     load_mode = HImode;
> > +      else if (cmp_bytes == 4)
> > +     load_mode = SImode;
> > +      else
> > +     load_mode = Xmode;
> > +
> > +      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset));
> > +      do_load_from_addr (load_mode, data1, addr1, src1);
> > +      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset));
> > +      do_load_from_addr (load_mode, data2, addr2, src2);
> So we're assuming that something simplifies x + 0 -> x for the first
> iteration of this loop.  I'm not sure that's going to be true for -O0.
> Can you double check that if this expansion code is used with -O0 that
> it gets cleaned up?  adjust_address or something else might be able to
> do this for you automagically.

-O0 does not trigger the expansion.
But I'll change to adjust_address(), where this is properly handled.
I'll also send a patch to adjust this for the strcmp expansion (where
I copied from).


>
> > +
> > +static void
> > +emit_memcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2)
> Needs function comment.
>
>
>
> > +
> > +static bool
> > +riscv_expand_block_compare_scalar (rtx result, rtx src1, rtx src2, rtx nbytes)
> Function comment.

Will do.

>
>
> > +
> > +  /* We need xlen-aligned memory.  */
> > +  unsigned HOST_WIDE_INT align = MIN (MEM_ALIGN (src1), MEM_ALIGN (src2));
> > +  if (align < (xlen * BITS_PER_UNIT))
> > +    return false;
> Not needed for this submission, but this probably could be relaxed based
> on uarch tuning.



>
> You might also ponder if we can do the result calculation in WORD_MODE.
> One of the things in my queue to submit is adjustment to the inline
> strcmp code to do that.  When done carefully we can squash out a sign
> extension.  Again, not strictly necessary, but perhaps a followup if
> you're interested.
>
> So generally OK.  The only technical concern is generation of x+0 as an
> address.  Depending on what you find there take appropriate action.  If
> nothing is needed in that space, then just fixup the missing comments
> and this is fine for the trunk.
>
> Jeff

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion
  2024-05-15  5:00   ` Christoph Müllner
@ 2024-05-15 20:22     ` Patrick O'Neill
  2024-05-16  8:02       ` Christoph Müllner
  0 siblings, 1 reply; 9+ messages in thread
From: Patrick O'Neill @ 2024-05-15 20:22 UTC (permalink / raw)
  To: Christoph Müllner; +Cc: gcc-patches, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 2081 bytes --]


On 5/14/24 22:00, Christoph Müllner wrote:
> On Fri, May 10, 2024 at 6:01 AM Patrick O'Neill<patrick@rivosinc.com>  wrote:
>> Hi Christoph,
>>
>> cpymemsi-1.c fails on a subset of newlib targets.
>>
>> "UNRESOLVED: gcc.target/riscv/cpymemsi-1.c   -O0  compilation failed to
>> produce executable"
>>
>> Full list of failing targets here (New Failures section):
>> https://github.com/patrick-rivos/gcc-postcommit-ci/issues/906
> Thanks for reporting!
> I'm having a hard time figuring out what the issue is here, as I can't
> reproduce it locally.
> This test is an execution test ("dg-do run"), so I wonder if this
> might be the issue?

riscv-gnu-toolchain configure command: ../configure --prefix=$(pwd) 
-with-arch=rv32imac_zba_zbb_zbc_zbs -with-abi=ilp32

Here's the verbose logs:

Executing on host: /scratch/tc-testing/tc-upstream/build/build-gcc-newlib-stage2/gcc/xgcc -B/scratch/tc-testing/tc-upstream/build/build-gcc-newlib-stage2/gcc/  /scratch/tc-testing/tc-upstream/gcc/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c  -march=rv32imac_zba_zbb_zbc_zbs -mabi=ilp32 -mcmodel=medlow   -fdiagnostics-plain-output    -O0  -march=rv32gc -save-temps -g0 -fno-lto -DRUN_FRACTION=11      -lm  -o ./cpymemsi-1.exe    (timeout = 1200)
spawn -ignore SIGHUP /scratch/tc-testing/tc-upstream/build/build-gcc-newlib-stage2/gcc/xgcc -B/scratch/tc-testing/tc-upstream/build/build-gcc-newlib-stage2/gcc/ /scratch/tc-testing/tc-upstream/gcc/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c -march=rv32imac_zba_zbb_zbc_zbs -mabi=ilp32 -mcmodel=medlow -fdiagnostics-plain-output -O0 -march=rv32gc -save-temps -g0 -fno-lto -DRUN_FRACTION=11 -lm -o ./cpymemsi-1.exe
xgcc: fatal error: Cannot find suitable multilib set for '-march=rv32imafdc_zicsr_zifencei'/'-mabi=ilp32'
compilation terminated.
compiler exited with status 1
FAIL: gcc.target/riscv/cpymemsi-1.c   -O0  (test for excess errors)

Looks like it's only failing on targets without the 'f' extension so 
maybe we need to add a riscv_f to avoid running on non-f targets 
(similar to what we have for riscv_v)?

Patrick

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion
  2024-05-15 20:22     ` Patrick O'Neill
@ 2024-05-16  8:02       ` Christoph Müllner
  0 siblings, 0 replies; 9+ messages in thread
From: Christoph Müllner @ 2024-05-16  8:02 UTC (permalink / raw)
  To: Patrick O'Neill; +Cc: gcc-patches, Jeff Law, Kito Cheng

On Wed, May 15, 2024 at 10:22 PM Patrick O'Neill <patrick@rivosinc.com> wrote:
>
>
> On 5/14/24 22:00, Christoph Müllner wrote:
>
> On Fri, May 10, 2024 at 6:01 AM Patrick O'Neill <patrick@rivosinc.com> wrote:
>
> Hi Christoph,
>
> cpymemsi-1.c fails on a subset of newlib targets.
>
> "UNRESOLVED: gcc.target/riscv/cpymemsi-1.c   -O0  compilation failed to
> produce executable"
>
> Full list of failing targets here (New Failures section):
> https://github.com/patrick-rivos/gcc-postcommit-ci/issues/906
>
> Thanks for reporting!
> I'm having a hard time figuring out what the issue is here, as I can't
> reproduce it locally.
> This test is an execution test ("dg-do run"), so I wonder if this
> might be the issue?
>
> riscv-gnu-toolchain configure command: ../configure --prefix=$(pwd) -with-arch=rv32imac_zba_zbb_zbc_zbs -with-abi=ilp32
>
> Here's the verbose logs:
>
> Executing on host: /scratch/tc-testing/tc-upstream/build/build-gcc-newlib-stage2/gcc/xgcc -B/scratch/tc-testing/tc-upstream/build/build-gcc-newlib-stage2/gcc/  /scratch/tc-testing/tc-upstream/gcc/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c  -march=rv32imac_zba_zbb_zbc_zbs -mabi=ilp32 -mcmodel=medlow   -fdiagnostics-plain-output    -O0  -march=rv32gc -save-temps -g0 -fno-lto -DRUN_FRACTION=11      -lm  -o ./cpymemsi-1.exe    (timeout = 1200)
> spawn -ignore SIGHUP /scratch/tc-testing/tc-upstream/build/build-gcc-newlib-stage2/gcc/xgcc -B/scratch/tc-testing/tc-upstream/build/build-gcc-newlib-stage2/gcc/ /scratch/tc-testing/tc-upstream/gcc/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c -march=rv32imac_zba_zbb_zbc_zbs -mabi=ilp32 -mcmodel=medlow -fdiagnostics-plain-output -O0 -march=rv32gc -save-temps -g0 -fno-lto -DRUN_FRACTION=11 -lm -o ./cpymemsi-1.exe
> xgcc: fatal error: Cannot find suitable multilib set for '-march=rv32imafdc_zicsr_zifencei'/'-mabi=ilp32'
> compilation terminated.
> compiler exited with status 1
> FAIL: gcc.target/riscv/cpymemsi-1.c   -O0  (test for excess errors)
>
> Looks like it's only failing on targets without the 'f' extension so maybe we need to add a riscv_f to avoid running on non-f targets (similar to what we have for riscv_v)?

Ok, now I understand what's going on.
For "dg-do run" tests we should be more liberal with the provided
`-march` string in dg-options
(or be more restrictive using effective-target checks if necessary -
which is not the case here).
I've sent out the following patch, which should address this issue:
  https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651841.html

BR
Christoph

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2024-05-16  8:03 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-08  5:52 [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion Christoph Müllner
2024-05-08  5:52 ` [PATCH 2/2] RISC-V: Add cmpmemsi expansion Christoph Müllner
2024-05-09 14:50   ` Jeff Law
2024-05-15  6:49     ` Christoph Müllner
2024-05-08 22:21 ` [PATCH 1/2] RISC-V: Add tests for cpymemsi expansion Jeff Law
2024-05-10  4:01 ` Patrick O'Neill
2024-05-15  5:00   ` Christoph Müllner
2024-05-15 20:22     ` Patrick O'Neill
2024-05-16  8:02       ` Christoph Müllner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).