commit 9464e72bcc9123b619215af8cfef491772a3ebd9
Author: Joern Rennecke
Date:   Mon Oct 2 03:16:09 2023 +0100

    cpymem for RISC-V with v extension

    gcc/
            * config/riscv/riscv-protos.h (riscv_vector::expand_block_move):
            Declare.
            * config/riscv/riscv-v.cc (riscv_vector::expand_block_move):
            New function.
            * config/riscv/riscv.md (cpymemsi): Use
            riscv_vector::expand_block_move.  Change to ..
            (cpymem<mode>) .. this.

    gcc/testsuite/
            * gcc.target/riscv/rvv/base/cpymem-1.c: New test.
            * gcc.target/riscv/rvv/base/cpymem-2.c: Likewise.

    Co-Authored-By: Juzhe-Zhong

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index af5baf37e6a..43426a5326b 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -492,6 +492,7 @@ bool slide1_sew64_helper (int, machine_mode, machine_mode,
                           machine_mode, rtx *);
 rtx gen_avl_for_scalar_move (rtx);
 void expand_tuple_move (rtx *);
+bool expand_block_move (rtx, rtx, rtx);
 machine_mode preferred_simd_mode (scalar_mode);
 machine_mode get_mask_mode (machine_mode);
 void expand_vec_series (rtx, rtx, rtx);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 097457562bd..29e138e1da2 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -49,6 +49,7 @@
 #include "tm-constrs.h"
 #include "rtx-vector-builder.h"
 #include "targhooks.h"
+#include "predict.h"
 
 using namespace riscv_vector;
 
@@ -1991,6 +1992,206 @@ expand_tuple_move (rtx *ops)
     }
 }
 
+/* Used by cpymem in riscv.md.  */
+
+bool
+expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
+{
+  /*
+    memcpy:
+        mv a3, a0                       # Copy destination
+    loop:
+        vsetvli t0, a2, e8, m8, ta, ma  # Vectors of 8b
+        vle8.v v0, (a1)                 # Load bytes
+        add a1, a1, t0                  # Bump pointer
+        sub a2, a2, t0                  # Decrement count
+        vse8.v v0, (a3)                 # Store bytes
+        add a3, a3, t0                  # Bump pointer
+        bnez a2, loop                   # Any more?
+        ret                             # Return
+  */
+  if (!TARGET_VECTOR)
+    return false;
+
+  HOST_WIDE_INT potential_ew
+    = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
+       / BITS_PER_UNIT);
+  machine_mode vmode = VOIDmode;
+  bool need_loop = true;
+  bool size_p = optimize_function_for_size_p (cfun);
+  rtx src, dst;
+  rtx end = gen_reg_rtx (Pmode);
+  rtx vec;
+  rtx length_rtx = length_in;
+
+  if (CONST_INT_P (length_in))
+    {
+      HOST_WIDE_INT length = INTVAL (length_in);
+
+      /* By using LMUL=8, we can copy as many bytes in one go as there
+         are bits in a vector register.  If the entire block thus fits,
+         we don't need a loop.  */
+      if (length <= TARGET_MIN_VLEN)
+        {
+          need_loop = false;
+
+          /* If a single scalar load / store pair can do the job, leave it
+             to the scalar code to do that.  */
+          /* ??? If fast unaligned access is supported, the scalar code could
+             use suitably sized scalars irrespective of alignment.  If that
+             gets fixed, we have to adjust the test here.  */
+
+          if (pow2p_hwi (length) && length <= potential_ew)
+            return false;
+        }
+
+      /* Find the vector mode to use.  Using the largest possible element
+         size is likely to give smaller constants, thus potentially reducing
+         code size.  However, if we need a loop, we need to update the
+         pointers, and that is more complicated with a larger element size,
+         unless we use an immediate, which prevents us from dynamically
+         using the target's transfer size that the hart supports.  And then,
+         unless we know the *exact* vector size of the hart, we'd need
+         multiple vsetvli / branch statements, so it's not even a size win.
+         If, in the future, we find an RVV implementation that is slower
+         for small element widths, we might allow larger element widths for
+         loops too.  */
+      if (need_loop)
+        potential_ew = 1;
+      for (; potential_ew; potential_ew >>= 1)
+        {
+          scalar_int_mode elem_mode;
+          unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
+          unsigned HOST_WIDE_INT per_iter;
+          HOST_WIDE_INT nunits;
+
+          if (need_loop)
+            per_iter = TARGET_MIN_VLEN;
+          else
+            per_iter = length;
+          nunits = per_iter / potential_ew;
+
+          /* Unless we get an implementation that's slow for small element
+             size / non-word-aligned accesses, we assume that the hardware
+             handles this well, and we don't want to complicate the code
+             with shifting word contents around or handling extra bytes at
+             the start and/or end.  So we want the total transfer size and
+             alignment to fit with the element size.  */
+          if (length % potential_ew != 0
+              || !int_mode_for_size (bits, 0).exists (&elem_mode))
+            continue;
+
+          /* Find the mode to use for the copy inside the loop - or the
+             sole copy, if there is no loop.  */
+          if (!need_loop)
+            {
+              /* Check if we have an exact mode for the copy.  */
+              if (get_vector_mode (elem_mode, nunits).exists (&vmode))
+                break;
+              /* Since we don't have a mode that exactly matches the transfer
+                 size, we'll need to use pred_store, which is not available
+                 for all vector modes, but only iE_RVV_M* modes, hence trying
+                 to find a vector mode for a merely rounded-up size is
+                 pointless.
+                 Still, by choosing a lower LMUL factor that still allows
+                 an entire transfer, we can reduce register pressure.  */
+              for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
+                if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
+                    /* Avoid losing the option of using vsetivli.  */
+                    && (nunits <= 31 * lmul || nunits > 31 * 8)
+                    && (get_vector_mode
+                         (elem_mode,
+                          exact_div (BYTES_PER_RISCV_VECTOR * lmul,
+                                     potential_ew)).exists (&vmode)))
+                  break;
+            }
+
+          /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
+             wide.  BYTES_PER_RISCV_VECTOR can't be evenly divided by the
+             sizes of larger element types; the LMUL factor of 8 can at the
+             moment be divided by the SEW, with SEW of up to 8 bytes, but
+             there are reserved encodings, so there might be larger SEW in
+             the future.  */
+          if (get_vector_mode (elem_mode,
+                               exact_div (BYTES_PER_RISCV_VECTOR * 8,
+                                          potential_ew)).exists (&vmode))
+            break;
+
+          /* We may get here if we tried an element size that's larger than
+             the hardware supports, but we should at least find a suitable
+             byte vector mode.  */
+          gcc_assert (potential_ew > 1);
+        }
+      if (potential_ew > 1)
+        length_rtx = GEN_INT (length / potential_ew);
+    }
+  else
+    {
+      vmode = E_RVVM8QImode;
+    }
+
+  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+     arguments + 1 for the call.  Since the RVV loop sequence takes 7
+     instructions, a libcall may be preferable when optimizing for size.  */
+  if (size_p && need_loop)
+    return false;
+
+  /* length_rtx holds the (remaining) length of the required copy.
+     cnt holds the length we copy with the current load/store pair.  */
+  rtx cnt = length_rtx;
+  rtx label = NULL_RTX;
+  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
+  rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));
+
+  if (need_loop)
+    {
+      length_rtx = copy_to_mode_reg (Pmode, length_rtx);
+      cnt = gen_reg_rtx (Pmode);
+      label = gen_label_rtx ();
+
+      emit_label (label);
+      emit_insn (gen_no_side_effects_vsetvl_rtx (vmode, cnt, length_rtx));
+    }
+
+  vec = gen_reg_rtx (vmode);
+  src = change_address (src_in, vmode, src_addr);
+  dst = change_address (dst_in, vmode, dst_addr);
+
+  /* If we don't need a loop and have a suitable mode to describe the size,
+     just do a load / store pair and leave it up to the later lazy code
+     motion pass to insert the appropriate vsetvli.  */
+  if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+    {
+      emit_move_insn (vec, src);
+      emit_move_insn (dst, vec);
+    }
+  else
+    {
+      machine_mode mask_mode
+        = get_vector_mode (BImode, GET_MODE_NUNITS (vmode)).require ();
+      rtx mask = CONSTM1_RTX (mask_mode);
+      if (!satisfies_constraint_K (cnt))
+        cnt = force_reg (Pmode, cnt);
+      rtx m_ops[] = {vec, mask, src};
+      emit_nonvlmax_insn (code_for_pred_mov (vmode), UNARY_OP_TAMA,
+                          m_ops, cnt);
+      emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt,
+                                 get_avl_type_rtx (NONVLMAX)));
+    }
+
+  if (need_loop)
+    {
+      emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
+      emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
+      emit_insn (gen_rtx_SET (length_rtx,
+                              gen_rtx_MINUS (Pmode, length_rtx, cnt)));
+
+      /* Emit the loop condition.  */
+      rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx);
+      emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label));
+      emit_insn (gen_nop ());
+    }
+
+  return true;
+}
+
 /* Return the vectorization machine mode for RVV according to LMUL.  */
 machine_mode
 preferred_simd_mode (scalar_mode mode)
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index e00b8ee3579..1ebe8f92284 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2271,14 +2271,16 @@
   DONE;
 })
 
-(define_expand "cpymemsi"
+(define_expand "cpymem<mode>"
   [(parallel [(set (match_operand:BLK 0 "general_operand")
                    (match_operand:BLK 1 "general_operand"))
-              (use (match_operand:SI 2 ""))
+              (use (match_operand:P 2 ""))
               (use (match_operand:SI 3 "const_int_operand"))])]
   ""
 {
-  if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
+  if (riscv_vector::expand_block_move (operands[0], operands[1], operands[2]))
+    DONE;
+  else if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
     DONE;
   else
     FAIL;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c
new file mode 100644
index 00000000000..9bb4904e8e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c
@@ -0,0 +1,71 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#if 0 /* Using include files when using a multilib-relevant -march option is dicey */
+#include <string.h>
+#else
+extern void *memcpy(void *__restrict dest, const void *__restrict src, __SIZE_TYPE__ n);
+#endif
+
+/* memcpy should be implemented using the cpymem pattern.
+** f1:
+XX      \.L\d+: # local label is ignored
+**      vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+**      vle8\.v\s+v\d+,0\(a1\)
+**      vse8\.v\s+v\d+,0\(a0\)
+**      add\s+a1,a1,[ta][0-7]
+**      add\s+a0,a0,[ta][0-7]
+**      sub\s+a2,a2,[ta][0-7]
+**      bne\s+a2,zero,\.L\d+
+**      ret
+*/
+
+void f1 (void *a, void *b, __SIZE_TYPE__ l)
+{
+  memcpy (a, b, l);
+}
+
+/* We should still use cpymem even with slightly different types, as signed
+   overflow is undefined.
+** f2:
+XX      \.L\d+: # local label is ignored
+**      vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+**      vle8\.v\s+v\d+,0\(a1\)
+**      vse8\.v\s+v\d+,0\(a0\)
+**      add\s+a1,a1,[ta][0-7]
+**      add\s+a0,a0,[ta][0-7]
+**      sub\s+a2,a2,[ta][0-7]
+**      bne\s+a2,zero,\.L\d+
+**      ret
+*/
+void f2 (__INT32_TYPE__* a, __INT32_TYPE__* b, int l)
+{
+  memcpy (a, b, l);
+}
+
+/* If it is known that the pointer arguments to memcpy point
+   to an aligned object, cpymem can use that alignment.
+   Use extern here so that we get a known alignment, lest
+   DATA_ALIGNMENT force us to make the scan pattern accommodate
+   code for different alignments depending on word size.
+** f3:
+**      lui\s+[ta][0-7],%hi\(a_a\)
+**      lui\s+[ta][0-7],%hi\(a_b\)
+**      addi\s+a4,[ta][0-7],%lo\(a_b\)
+**      vsetivli\s+zero,16,e32,m4,ta,ma
+**      vle32.v\s+v\d+,0\([ta][0-7]\)
+**      addi\s+[ta][0-7],[ta][0-7],%lo\(a_a\)
+**      vse32\.v\s+v\d+,0\([ta][0-7]\)
+**      ret
+*/
+
+extern struct { __INT32_TYPE__ a[16]; } a_a, a_b;
+
+void f3 ()
+{
+  memcpy (&a_a, &a_b, sizeof a_a);
+}
+
+/* { dg-final { scan-assembler-not {\m(tail|call)\s+memcpy\M} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c
new file mode 100644
index 00000000000..7b706b6ef52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef struct { char c[16]; } c16;
+typedef struct { char c[32]; } c32;
+typedef struct { short s; char c[30]; } s16;
+
+/* A short struct copy can use vsetivli.
+** f1:
+**      vsetivli\s+zero,16,e8,m1,ta,ma
+**      vle8.v\s+v1,0\(a1\)
+**      vse8.v\s+v1,0\(a0\)
+**      ret
+*/
+void f1 (c16 *a, c16* b)
+{
+  *a = *b;
+}
+
+/* A longer one needs li.
+** f2:
+**      li\s+[ta][0-7],32
+**      vsetvli\s+zero,[ta][0-7],e8,m2,ta,ma
+**      vle8.v\s+v2,0\(a1\)
+**      vse8.v\s+v2,0\(a0\)
+**      ret
+*/
+void f2 (c32 *a, c32* b)
+{
+  *a = *b;
+}
+
+/* A 32 byte struct is still short enough for vsetivli
+   if we can use an element width larger than 8.
+** f3:
+**      vsetivli\s+zero,16,e16,m2,ta,ma
+**      vle16.v\s+v2,0\(a1\)
+**      vse16.v\s+v2,0\(a0\)
+**      ret
+*/
+void f3 (s16 *a, s16* b)
+{
+  *a = *b;
+}
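
For reference, here is a stand-alone C sketch (not part of the patch) that models the copy loop riscv_vector::expand_block_move emits, matching the asm comment at the top of that function: vsetvli picks the per-iteration byte count, a vector load/store pair copies one chunk, the pointers are bumped and the remaining length is decremented until it reaches zero.  VLEN_BYTES is an assumed placeholder for the per-iteration transfer size (VLEN/8 * LMUL) that vsetvli reports at run time on the actual hart; block_move_model is likewise a hypothetical name used only for illustration.

/* Illustrative scalar model of the emitted e8/m8 copy loop; not part of
   the patch.  */
#include <stddef.h>
#include <string.h>

#define VLEN_BYTES 128  /* assumed placeholder: VLEN/8 * LMUL, here VLEN=128 */

void
block_move_model (char *dst, const char *src, size_t len)
{
  while (len)
    {
      /* vsetvli t0, a2, e8, m8, ta, ma: chunk = min (len, VLEN_BYTES).  */
      size_t chunk = len < VLEN_BYTES ? len : VLEN_BYTES;

      /* vle8.v / vse8.v: copy one chunk.  */
      memcpy (dst, src, chunk);

      /* add a1, a1, t0 / add a3, a3, t0 / sub a2, a2, t0:
         bump the pointers and decrement the count.  */
      src += chunk;
      dst += chunk;
      len -= chunk;
    }
}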