commit 9464e72bcc9123b619215af8cfef491772a3ebd9
Author: Joern Rennecke
Date:   Mon Oct 2 03:16:09 2023 +0100

    cpymem for RISC-V with v extension

    gcc/
            * config/riscv/riscv-protos.h (riscv_vector::expand_block_move):
            Declare.
            * config/riscv/riscv-v.cc (riscv_vector::expand_block_move):
            New function.
            * config/riscv/riscv.md (cpymemsi): Use
            riscv_vector::expand_block_move.  Change to ..
            (cpymem<mode>) .. this.

    gcc/testsuite/
            * gcc.target/riscv/rvv/base/cpymem-1.c: New test.
            * gcc.target/riscv/rvv/base/cpymem-2.c: Likewise.

    Co-Authored-By: Juzhe-Zhong

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index af5baf37e6a..43426a5326b 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -492,6 +492,7 @@ bool slide1_sew64_helper (int, machine_mode, machine_mode,
                           machine_mode, rtx *);
 rtx gen_avl_for_scalar_move (rtx);
 void expand_tuple_move (rtx *);
+bool expand_block_move (rtx, rtx, rtx);
 machine_mode preferred_simd_mode (scalar_mode);
 machine_mode get_mask_mode (machine_mode);
 void expand_vec_series (rtx, rtx, rtx);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 097457562bd..29e138e1da2 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -49,6 +49,7 @@
 #include "tm-constrs.h"
 #include "rtx-vector-builder.h"
 #include "targhooks.h"
+#include "predict.h"
 
 using namespace riscv_vector;
 
@@ -1991,6 +1992,206 @@ expand_tuple_move (rtx *ops)
     }
 }
 
+/* Used by cpymem in riscv.md.  */
+
+bool
+expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
+{
+  /*
+    memcpy:
+        mv a3, a0                       # Copy destination
+    loop:
+        vsetvli t0, a2, e8, m8, ta, ma  # Vectors of 8b
+        vle8.v v0, (a1)                 # Load bytes
+        add a1, a1, t0                  # Bump pointer
+        sub a2, a2, t0                  # Decrement count
+        vse8.v v0, (a3)                 # Store bytes
+        add a3, a3, t0                  # Bump pointer
+        bnez a2, loop                   # Any more?
+        ret                             # Return
+  */
+  if (!TARGET_VECTOR)
+    return false;
+
+  HOST_WIDE_INT potential_ew
+    = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
+       / BITS_PER_UNIT);
+  machine_mode vmode = VOIDmode;
+  bool need_loop = true;
+  bool size_p = optimize_function_for_size_p (cfun);
+  rtx src, dst;
+  rtx end = gen_reg_rtx (Pmode);
+  rtx vec;
+  rtx length_rtx = length_in;
+
+  if (CONST_INT_P (length_in))
+    {
+      HOST_WIDE_INT length = INTVAL (length_in);
+
+      /* By using LMUL=8, we can copy as many bytes in one go as there
+         are bits in a vector register.  If the entire block thus fits,
+         we don't need a loop.  */
+      if (length <= TARGET_MIN_VLEN)
+        {
+          need_loop = false;
+
+          /* If a single scalar load / store pair can do the job, leave it
+             to the scalar code to do that.  */
+          /* ??? If fast unaligned access is supported, the scalar code could
+             use suitably sized scalars irrespective of alignment.  If that
+             gets fixed, we have to adjust the test here.  */
+
+          if (pow2p_hwi (length) && length <= potential_ew)
+            return false;
+        }
+
+      /* Find the vector mode to use.  Using the largest possible element
+         size is likely to give smaller constants, thus potentially reducing
+         code size.  However, if we need a loop, we need to update the
+         pointers, and that is more complicated with a larger element size,
+         unless we use an immediate, which prevents us from dynamically
+         using the target's transfer size that the hart supports.  And then,
+         unless we know the *exact* vector size of the hart, we'd need
+         multiple vsetvli / branch statements, so it's not even a size win.
+         If, in the future, we find an RVV implementation that is slower
+         for small element widths, we might allow larger element widths for
+         loops too.  */
+      if (need_loop)
+        potential_ew = 1;
+      for (; potential_ew; potential_ew >>= 1)
+        {
+          scalar_int_mode elem_mode;
+          unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
+          unsigned HOST_WIDE_INT per_iter;
+          HOST_WIDE_INT nunits;
+
+          if (need_loop)
+            per_iter = TARGET_MIN_VLEN;
+          else
+            per_iter = length;
+          nunits = per_iter / potential_ew;
+
+          /* Unless we get an implementation that's slow for small element
+             size / non-word-aligned accesses, we assume that the hardware
+             handles this well, and we don't want to complicate the code
+             with shifting word contents around or handling extra bytes at
+             the start and/or end.  So we want the total transfer size and
+             alignment to fit with the element size.  */
+          if (length % potential_ew != 0
+              || !int_mode_for_size (bits, 0).exists (&elem_mode))
+            continue;
+
+          /* Find the mode to use for the copy inside the loop - or the
+             sole copy, if there is no loop.  */
+          if (!need_loop)
+            {
+              /* Check if we have an exact mode for the copy.  */
+              if (get_vector_mode (elem_mode, nunits).exists (&vmode))
+                break;
+              /* Since we don't have a mode that exactly matches the transfer
+                 size, we'll need to use pred_store, which is not available
+                 for all vector modes, but only iE_RVV_M* modes, hence trying
+                 to find a vector mode for a merely rounded-up size is
+                 pointless.
+                 Still, by choosing a lower LMUL factor that still allows
+                 an entire transfer, we can reduce register pressure.  */
+              for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
+                if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
+                    /* Avoid losing the option of using vsetivli.  */
+                    && (nunits <= 31 * lmul || nunits > 31 * 8)
+                    && (get_vector_mode
+                         (elem_mode,
+                          exact_div (BYTES_PER_RISCV_VECTOR * lmul,
+                                     potential_ew)).exists (&vmode)))
+                  break;
+            }
+
+          /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
+             wide.  BYTES_PER_RISCV_VECTOR can't be evenly divided by the
+             sizes of larger element types; the LMUL factor of 8 can at the
+             moment be divided by the SEW, with SEW of up to 8 bytes, but
+             there are reserved encodings, so there might be larger SEW in
+             the future.  */
+          if (get_vector_mode (elem_mode,
+                               exact_div (BYTES_PER_RISCV_VECTOR * 8,
+                                          potential_ew)).exists (&vmode))
+            break;
+
+          /* We may get here if we tried an element size that's larger than
+             the hardware supports, but we should at least find a suitable
+             byte vector mode.  */
+          gcc_assert (potential_ew > 1);
+        }
+      if (potential_ew > 1)
+        length_rtx = GEN_INT (length / potential_ew);
+    }
+  else
+    {
+      vmode = E_RVVM8QImode;
+    }
+
+  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+     arguments + 1 for the call.  Since the RVV loop sequence takes 7
+     instructions, a libcall may be preferable when optimizing for size.  */
+  if (size_p && need_loop)
+    return false;
+
+  /* length_rtx holds the (remaining) length of the required copy.
+     cnt holds the length we copy with the current load/store pair.  */
+  rtx cnt = length_rtx;
+  rtx label = NULL_RTX;
+  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
+  rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));
+
+  if (need_loop)
+    {
+      length_rtx = copy_to_mode_reg (Pmode, length_rtx);
+      cnt = gen_reg_rtx (Pmode);
+      label = gen_label_rtx ();
+
+      emit_label (label);
+      emit_insn (gen_no_side_effects_vsetvl_rtx (vmode, cnt, length_rtx));
+    }
+
+  vec = gen_reg_rtx (vmode);
+  src = change_address (src_in, vmode, src_addr);
+  dst = change_address (dst_in, vmode, dst_addr);
+
+  /* If we don't need a loop and have a suitable mode to describe the size,
+     just do a load / store pair and leave it up to the later lazy code
+     motion pass to insert the appropriate vsetvli.  */
+  if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+    {
+      emit_move_insn (vec, src);
+      emit_move_insn (dst, vec);
+    }
+  else
+    {
+      machine_mode mask_mode
+        = get_vector_mode (BImode, GET_MODE_NUNITS (vmode)).require ();
+      rtx mask = CONSTM1_RTX (mask_mode);
+      if (!satisfies_constraint_K (cnt))
+        cnt = force_reg (Pmode, cnt);
+      rtx m_ops[] = {vec, mask, src};
+      emit_nonvlmax_insn (code_for_pred_mov (vmode), UNARY_OP_TAMA,
+                          m_ops, cnt);
+      emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt,
+                                 get_avl_type_rtx (NONVLMAX)));
+    }
+
+  if (need_loop)
+    {
+      emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
+      emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
+      emit_insn (gen_rtx_SET (length_rtx,
+                              gen_rtx_MINUS (Pmode, length_rtx, cnt)));
+
+      /* Emit the loop condition.  */
+      rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx);
+      emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label));
+      emit_insn (gen_nop ());
+    }
+
+  return true;
+}
+
 /* Return the vectorization machine mode for RVV according to LMUL.  */
 machine_mode
 preferred_simd_mode (scalar_mode mode)
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index e00b8ee3579..1ebe8f92284 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2271,14 +2271,16 @@
   DONE;
 })
 
-(define_expand "cpymemsi"
+(define_expand "cpymem<mode>"
   [(parallel [(set (match_operand:BLK 0 "general_operand")
                    (match_operand:BLK 1 "general_operand"))
-              (use (match_operand:SI 2 ""))
+              (use (match_operand:P 2 ""))
               (use (match_operand:SI 3 "const_int_operand"))])]
   ""
 {
-  if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
+  if (riscv_vector::expand_block_move (operands[0], operands[1], operands[2]))
+    DONE;
+  else if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
     DONE;
   else
     FAIL;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c
new file mode 100644
index 00000000000..9bb4904e8e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c
@@ -0,0 +1,71 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#if 0 /* Using include files when using a multilib-relevant -march option is dicey */
+#include <string.h>
+#else
+extern void *memcpy(void *__restrict dest, const void *__restrict src, __SIZE_TYPE__ n);
+#endif
+
+/* memcpy should be implemented using the cpymem pattern.
+** f1:
+XX      \.L\d+: # local label is ignored
+**      vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+**      vle8\.v\s+v\d+,0\(a1\)
+**      vse8\.v\s+v\d+,0\(a0\)
+**      add\s+a1,a1,[ta][0-7]
+**      add\s+a0,a0,[ta][0-7]
+**      sub\s+a2,a2,[ta][0-7]
+**      bne\s+a2,zero,\.L\d+
+**      ret
+*/
+
+void f1 (void *a, void *b, __SIZE_TYPE__ l)
+{
+  memcpy (a, b, l);
+}
+
+/* We should still use cpymem even with slightly different types, as signed
+   overflow is undefined.
+** f2:
+XX      \.L\d+: # local label is ignored
+**      vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+**      vle8\.v\s+v\d+,0\(a1\)
+**      vse8\.v\s+v\d+,0\(a0\)
+**      add\s+a1,a1,[ta][0-7]
+**      add\s+a0,a0,[ta][0-7]
+**      sub\s+a2,a2,[ta][0-7]
+**      bne\s+a2,zero,\.L\d+
+**      ret
+*/
+void f2 (__INT32_TYPE__* a, __INT32_TYPE__* b, int l)
+{
+  memcpy (a, b, l);
+}
+
+/* If it is known that the pointer arguments to memcpy point
+   to an aligned object, cpymem can use that alignment.
+   Use extern here so that we get a known alignment, lest
+   DATA_ALIGNMENT force us to make the scan pattern accommodate
+   code for different alignments depending on word size.
+** f3:
+**      lui\s+[ta][0-7],%hi\(a_a\)
+**      lui\s+[ta][0-7],%hi\(a_b\)
+**      addi\s+a4,[ta][0-7],%lo\(a_b\)
+**      vsetivli\s+zero,16,e32,m4,ta,ma
+**      vle32.v\s+v\d+,0\([ta][0-7]\)
+**      addi\s+[ta][0-7],[ta][0-7],%lo\(a_a\)
+**      vse32\.v\s+v\d+,0\([ta][0-7]\)
+**      ret
+*/
+
+extern struct { __INT32_TYPE__ a[16]; } a_a, a_b;
+
+void f3 ()
+{
+  memcpy (&a_a, &a_b, sizeof a_a);
+}
+
+/* { dg-final { scan-assembler-not {\m(tail|call)\s+memcpy\M} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c
new file mode 100644
index 00000000000..7b706b6ef52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef struct { char c[16]; } c16;
+typedef struct { char c[32]; } c32;
+typedef struct { short s; char c[30]; } s16;
+
+/* A short struct copy can use vsetivli.
+** f1:
+**      vsetivli\s+zero,16,e8,m1,ta,ma
+**      vle8.v\s+v1,0\(a1\)
+**      vse8.v\s+v1,0\(a0\)
+**      ret
+*/
+void f1 (c16 *a, c16* b)
+{
+  *a = *b;
+}
+
+/* A longer one needs li.
+** f2:
+**      li\s+[ta][0-7],32
+**      vsetvli\s+zero,[ta][0-7],e8,m2,ta,ma
+**      vle8.v\s+v2,0\(a1\)
+**      vse8.v\s+v2,0\(a0\)
+**      ret
+*/
+void f2 (c32 *a, c32* b)
+{
+  *a = *b;
+}
+
+/* A 32 byte struct is still short enough for vsetivli
+   if we can use an element width larger than 8.
+** f3:
+**      vsetivli\s+zero,16,e16,m2,ta,ma
+**      vle16.v\s+v2,0\(a1\)
+**      vse16.v\s+v2,0\(a0\)
+**      ret
+*/
+void f3 (s16 *a, s16* b)
+{
+  *a = *b;
+}
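
For reference, here is a stand-alone C sketch (not part of the patch) that models the copy loop riscv_vector::expand_block_move emits, matching the asm comment at the top of that function: vsetvli picks the per-iteration byte count, a vector load/store pair copies one chunk, the pointers are bumped and the remaining length is decremented until it reaches zero.  VLEN_BYTES is an assumed placeholder for the per-iteration transfer size (VLEN/8 * LMUL) that vsetvli reports at run time on the actual hart; block_move_model is likewise a hypothetical name used only for illustration.

/* Illustrative scalar model of the emitted e8/m8 copy loop; not part of
   the patch.  */
#include <stddef.h>
#include <string.h>

#define VLEN_BYTES 128  /* assumed placeholder: VLEN/8 * LMUL, here VLEN=128 */

void
block_move_model (char *dst, const char *src, size_t len)
{
  while (len)
    {
      /* vsetvli t0, a2, e8, m8, ta, ma: chunk = min (len, VLEN_BYTES).  */
      size_t chunk = len < VLEN_BYTES ? len : VLEN_BYTES;

      /* vle8.v / vse8.v: copy one chunk.  */
      memcpy (dst, src, chunk);

      /* add a1, a1, t0 / add a3, a3, t0 / sub a2, a2, t0:
         bump the pointers and decrement the count.  */
      src += chunk;
      dst += chunk;
      len -= chunk;
    }
}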