public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] RISC-V: Add rawmemchr expander.
@ 2023-10-26 20:29 Robin Dapp
  2023-10-26 22:37 ` 钟居哲
  0 siblings, 1 reply; 12+ messages in thread
From: Robin Dapp @ 2023-10-26 20:29 UTC (permalink / raw)
  To: gcc-patches, palmer, Kito Cheng, jeffreyalaw, juzhe.zhong; +Cc: rdapp.gcc

Hi,

this patch adds a vectorized rawmemchr expander.
It's basically strstr but for 8, 16 and 32-byte needles.

Apart from adjusting the common-code tests I re-used a
similar test that Stefan added to the s390 backend.

Regards
 Robin

gcc/ChangeLog:

	* config/riscv/autovec.md (rawmemchr<ANYI:mode>): New expander.
	* config/riscv/riscv-protos.h (enum insn_type): Define.
	(expand_rawmemchr): New function.
	* config/riscv/riscv-v.cc (expand_rawmemchr): Add vectorized
	rawmemchr.
	* internal-fn.cc (expand_RAWMEMCHR): Fix typo.

gcc/testsuite/ChangeLog:

	* gcc.dg/tree-ssa/ldist-rawmemchr-1.c: Add riscv.
	* gcc.dg/tree-ssa/ldist-rawmemchr-2.c: Ditto.
	* gcc.target/riscv/rvv/autovec/rawmemchr-1.c: New test.
---
 gcc/config/riscv/autovec.md                   | 13 +++
 gcc/config/riscv/riscv-protos.h               |  1 +
 gcc/config/riscv/riscv-v.cc                   | 99 +++++++++++++++++++
 gcc/internal-fn.cc                            |  2 +-
 .../gcc.dg/tree-ssa/ldist-rawmemchr-1.c       |  8 +-
 .../gcc.dg/tree-ssa/ldist-rawmemchr-2.c       |  8 +-
 .../riscv/rvv/autovec/rawmemchr-1.c           | 99 +++++++++++++++++++
 7 files changed, 221 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/rawmemchr-1.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 80910ba3cc2..5f49d73be44 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2367,3 +2367,16 @@ (define_expand "lfloor<mode><v_i_l_ll_convert>2"
     DONE;
   }
 )
+
+;; Implement rawmemchr[qi|si|hi].
+(define_expand "rawmemchr<ANYI:mode>"
+  [(match_operand      0 "register_operand")
+   (match_operand      1 "memory_operand")
+   (match_operand:ANYI 2 "const_int_operand")]
+  "TARGET_VECTOR"
+  {
+    riscv_vector::expand_rawmemchr(<MODE>mode, operands[0], operands[1],
+				   operands[2]);
+    DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 668d75043ca..3c092c82ab1 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -522,6 +522,7 @@ void expand_cond_unop (unsigned, rtx *);
 void expand_cond_binop (unsigned, rtx *);
 void expand_cond_ternop (unsigned, rtx *);
 void expand_popcount (rtx *);
+void expand_rawmemchr (machine_mode, rtx, rtx, rtx);
 
 /* Rounding mode bitfield for fixed point VXRM.  */
 enum fixed_point_rounding_mode
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 3fe8125801b..653796a5ad2 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -2215,6 +2215,105 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
   return true;
 }
 
+/* Implement rawmemchr<mode> using vector instructions.
+   It can be assumed that the needle is in the haystack, otherwise the
+   behavior is undefined.  */
+
+void
+expand_rawmemchr (machine_mode mode, rtx dst, rtx src, rtx pat)
+{
+  /*
+    rawmemchr:
+    loop:
+       vsetvli a1, zero, e[8,16,32,64], m1, ta, ma
+       vle[8,16,32,64]ff.v v8, (a0)  # Load.
+       csrr a1, vl		     # Get number of bytes read.
+       vmseq.vx v0, v8, pat	     # v0 = (v8 == {pat, pat, ...})
+       vfirst.m a2, v0		     # Find first hit.
+       add a0, a0, a1		     # Bump pointer.
+       bltz a2, loop		     # Not found?
+
+       sub a0, a0, a1		     # Go back by a1.
+       shll a2, a2, [0,1,2,3]	     # Shift to get byte offset.
+       add a0, a0, a2		     # Add the offset.
+
+       ret
+  */
+  gcc_assert (TARGET_VECTOR);
+
+  machine_mode vmode;
+  switch (mode)
+    {
+      case QImode:
+	vmode = E_RVVM1QImode;
+	break;
+      case HImode:
+	vmode = E_RVVM1HImode;
+	break;
+      case SImode:
+	vmode = E_RVVM1SImode;
+	break;
+      case DImode:
+	vmode = E_RVVM1DImode;
+	break;
+      default:
+	gcc_unreachable ();
+    }
+  machine_mode mask_mode = get_mask_mode (vmode);
+
+  rtx cnt = gen_reg_rtx (Pmode);
+  rtx end = gen_reg_rtx (Pmode);
+  rtx vec = gen_reg_rtx (vmode);
+  rtx mask = gen_reg_rtx (mask_mode);
+
+  /* After finding the first vector element matching the needle, we
+     need to multiply by the vector element width (SEW) in order to
+     return a pointer to the matching byte.  */
+  unsigned int shift = exact_log2 (GET_MODE_SIZE (mode).to_constant ());
+
+  rtx src_addr = copy_addr_to_reg (XEXP (src, 0));
+
+  rtx loop = gen_label_rtx ();
+  emit_label (loop);
+
+  rtx vsrc = change_address (src, vmode, src_addr);
+
+  /* Emit a first-fault load.  */
+  rtx vlops[] = {vec, vsrc};
+  emit_vlmax_insn (code_for_pred_fault_load (vmode), UNARY_OP, vlops);
+
+  /* Read how far we read.  */
+  if (Pmode == SImode)
+    emit_insn (gen_read_vlsi (cnt));
+  else
+    emit_insn (gen_read_vldi_zero_extend (cnt));
+
+  /* Compare needle with haystack and store in a mask.  */
+  rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, pat), vec);
+  rtx vmsops[] = {mask, eq, vec, pat};
+  emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode), COMPARE_OP, vmsops,
+		      cnt);
+
+  /* Find the first bit in the mask.  */
+  rtx vfops[] = {end, mask};
+  emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
+		      CPOP_OP, vfops, cnt);
+
+  /* Bump the pointer.  */
+  emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
+
+  /* Emit the loop condition.  */
+  rtx test = gen_rtx_LT (VOIDmode, end, const0_rtx);
+  emit_jump_insn (gen_cbranch4 (Pmode, test, end, const0_rtx, loop));
+
+  /*  We overran by CNT, subtract it.  */
+  emit_insn (gen_rtx_SET (src_addr, gen_rtx_MINUS (Pmode, src_addr, cnt)));
+
+  /*  We found something at SRC + END * [1,2,4,8].  */
+  emit_insn (gen_rtx_SET (end, gen_rtx_ASHIFT (Pmode, end, GEN_INT (shift))));
+  emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
+}
+
 /* Return the vectorization machine mode for RVV according to LMUL.  */
 machine_mode
 preferred_simd_mode (scalar_mode mode)
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 61d5a9e4772..e7451b96353 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -3208,7 +3208,7 @@ expand_VEC_CONVERT (internal_fn, gcall *)
   gcc_unreachable ();
 }
 
-/* Expand IFN_RAWMEMCHAR internal function.  */
+/* Expand IFN_RAWMEMCHR internal function.  */
 
 void
 expand_RAWMEMCHR (internal_fn, gcall *stmt)
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c
index bf6335f6360..adf53b10def 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c
@@ -1,9 +1,9 @@
-/* { dg-do run { target s390x-*-* } } */
+/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */
 /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
 /* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
 
 /* Rawmemchr pattern: reduction stmt and no store */
 
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c
index 83f5a35a322..6c8a485a3aa 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c
@@ -1,9 +1,9 @@
-/* { dg-do run { target s390x-*-* } } */
+/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */
 /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
 /* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */
-/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */
 
 /* Rawmemchr pattern: reduction stmt and store */
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/rawmemchr-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/rawmemchr-1.c
new file mode 100644
index 00000000000..ba83cb3836f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/rawmemchr-1.c
@@ -0,0 +1,99 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=gnu99 -O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */
+
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#define rawmemchrT(T, pattern)     \
+__attribute__((noinline,noclone))  \
+T* rawmemchr_##T (T *s)            \
+{                                  \
+  while (*s != pattern)            \
+    ++s;                           \
+  return s;                        \
+}
+
+rawmemchrT(int8_t, (int8_t)0xde)
+rawmemchrT(uint8_t, 0xde)
+rawmemchrT(int16_t, (int16_t)0xdead)
+rawmemchrT(uint16_t, 0xdead)
+rawmemchrT(int32_t, (int32_t)0xdeadbeef)
+rawmemchrT(uint32_t, 0xdeadbeef)
+
+#define runT(T, pattern)                           \
+void run_##T ()                                    \
+{                                                  \
+  T *buf = malloc (4096 * 2 * sizeof(T));          \
+  assert (buf != NULL);                            \
+  memset (buf, 0xa, 4096 * 2 * sizeof(T));         \
+  /* ensure q is 4096-byte aligned */              \
+  T *q = (T*)((unsigned char *)buf                 \
+              + (4096 - ((uintptr_t)buf & 4095))); \
+  T *p;                                            \
+  /* unaligned + block boundary + 1st load */      \
+  p = (T *) ((uintptr_t)q - 8);                    \
+  p[2] = pattern;                                  \
+  assert ((rawmemchr_##T (&p[0]) == &p[2]));       \
+  p[2] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + block boundary + 2nd load */      \
+  p = (T *) ((uintptr_t)q - 8);                    \
+  p[6] = pattern;                                  \
+  assert ((rawmemchr_##T (&p[0]) == &p[6]));       \
+  p[6] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + 1st load */                       \
+  q[5] = pattern;                                  \
+  assert ((rawmemchr_##T (&q[2]) == &q[5]));       \
+  q[5] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + 2nd load */                       \
+  q[14] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[14]));      \
+  q[14] = (T) 0xaaaaaaaa;                          \
+  /* unaligned + 3rd load */                       \
+  q[19] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[19]));      \
+  q[19] = (T) 0xaaaaaaaa;                          \
+  /* unaligned + 4th load */                       \
+  q[25] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[25]));      \
+  q[25] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 1st load */                         \
+  q[5] = pattern;                                  \
+  assert ((rawmemchr_##T (&q[0]) == &q[5]));       \
+  q[5] = (T) 0xaaaaaaaa;                           \
+  /* aligned + 2nd load */                         \
+  q[14] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[14]));      \
+  q[14] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 3rd load */                         \
+  q[19] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[19]));      \
+  q[19] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 4th load */                         \
+  q[25] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[25]));      \
+  q[25] = (T) 0xaaaaaaaa;                          \
+  free (buf);                                      \
+}
+
+runT(int8_t, (int8_t)0xde)
+runT(uint8_t, 0xde)
+runT(int16_t, (int16_t)0xdead)
+runT(uint16_t, 0xdead)
+runT(int32_t, (int32_t)0xdeadbeef)
+runT(uint32_t, 0xdeadbeef)
+
+int main (void)
+{
+  run_uint8_t ();
+  run_int8_t ();
+  run_uint16_t ();
+  run_int16_t ();
+  run_uint32_t ();
+  run_int32_t ();
+  return 0;
+}
-- 
2.41.0

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2023-10-27 13:39 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-26 20:29 [PATCH] RISC-V: Add rawmemchr expander Robin Dapp
2023-10-26 22:37 ` 钟居哲
2023-10-27  7:38   ` Robin Dapp
2023-10-27  7:39     ` juzhe.zhong
2023-10-27  7:47       ` Kito Cheng
2023-10-27  7:51         ` Robin Dapp
2023-10-27  7:52           ` juzhe.zhong
2023-10-27  7:53             ` Robin Dapp
2023-10-27  8:50               ` Robin Dapp
2023-10-27 11:39                 ` 钟居哲
2023-10-27 11:46                   ` Robin Dapp
2023-10-27 13:38     ` Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).