From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: 
Received: by sourceware.org (Postfix, from userid 1005)
	id 024223858D20; Sat, 3 Dec 2022 04:34:42 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 024223858D20
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1670042083;
	bh=gpadIqQDB1EIrHckLNAyuPJsfW1Ir+ylVxq+5k1HmSw=;
	h=From:To:Subject:Date:From;
	b=OXzR8jKYhxTES4irYTxmKDwrYt+dUKPAdi33x3jyex8eiii/ZIYCeN9uDgbZk8IsH
	 j26YFULIF3I3Bubr6U9vC1SczzH+C+rpjmUDxN6r3SIxWSZqNP1rN0U4ChOtIgxD7G
	 39yClCZZMllgw8Iu9cGj+mbyZ2J5a/w46/Bx3egU=
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Michael Meissner
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/users/meissner/heads/dmf005)] Use lxvl and stxvl for
 small variable memcpy moves.
X-Act-Checkin: gcc
X-Git-Author: Michael Meissner
X-Git-Refname: refs/users/meissner/heads/dmf005
X-Git-Oldrev: 359bb98ed976c82d50bc9078cadd1259886d971d
X-Git-Newrev: 76b94f28c88e78569c408fea65191e3b5678b76d
Message-Id: <20221203043443.024223858D20@sourceware.org>
Date: Sat, 3 Dec 2022 04:34:42 +0000 (GMT)
List-Id: 

https://gcc.gnu.org/g:76b94f28c88e78569c408fea65191e3b5678b76d

commit 76b94f28c88e78569c408fea65191e3b5678b76d
Author: Michael Meissner
Date:   Fri Dec 2 23:34:17 2022 -0500

    Use lxvl and stxvl for small variable memcpy moves.

    This patch adds support for generating inline code for a block copy
    whose size is variable, when that size is 16 bytes or less.  If the
    size is more than 16 bytes, we just call memcpy.

    To handle variable sizes, I found we need DImode versions of the two
    insns for copying memory (cpymem<mode> and movmem<mode>).

    2022-12-02  Michael Meissner

    gcc/

    	* config/rs6000/rs6000-string.cc (toplevel): Include optabs.h.
    	(expand_lxvl_stxvl): New helper function for variable sized moves.
    	(expand_block_move_variable): New function to optionally generate
    	variable sized block move insns.
    	(expand_block_move): Add support for using lxvl and stxvl to move
    	bytes inline if the variable length is small enough before calling
    	memcpy.
    	* config/rs6000/rs6000.md (cpymem<mode>): Expand cpymemsi to also
    	provide cpymemdi to handle DImode sizes as well as SImode sizes.
    	(movmem<mode>): Expand movmemsi to also provide movmemdi to handle
    	DImode sizes as well as SImode sizes.
    	* config/rs6000/rs6000.opt (rs6000-memcpy-inline-bytes): New
    	parameter.
    	* config/rs6000/vsx.md (lxvprl): New insn for -mcpu=future.
    	(stxvprl): Likewise.

Diff:
---
 gcc/config/rs6000/rs6000-string.cc | 129 ++++++++++++++++++++++++++++++++++++-
 gcc/config/rs6000/rs6000.md        |  12 ++--
 gcc/config/rs6000/rs6000.opt       |   5 ++
 gcc/config/rs6000/vsx.md           |  23 +++++++
 4 files changed, 161 insertions(+), 8 deletions(-)
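
To make the code generation concrete, the sequence emitted for a
variable-length copy is roughly equivalent to the following C, written
with the ISA 3.0 vec_xl_len/vec_xst_len intrinsics that map to
lxvl/stxvl.  This is an illustrative sketch only (the function name is
invented, and it assumes the default rs6000-memcpy-inline-bytes value
of 16); the patch itself works at the RTL level:

    #include <altivec.h>
    #include <string.h>

    /* Sketch of what a variable-size copy expands to with a 16-byte
       inline limit.  Illustrative only, not part of the patch.  */
    void
    copy_var (unsigned char *dest, unsigned char *src, size_t n)
    {
      if (n <= 16)
        {
          /* One lxvl/stxvl pair copies min (n, 16) bytes.  */
          __vector unsigned char v = vec_xl_len (src, n);
          vec_xst_len (v, dest, n);
        }
      else
        /* Larger sizes still go to the library.  */
        memcpy (dest, src, n);
    }
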
diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc
index cd8ee8c2f7e..ee17ddb87e1 100644
--- a/gcc/config/rs6000/rs6000-string.cc
+++ b/gcc/config/rs6000/rs6000-string.cc
@@ -37,6 +37,7 @@
 #include "target.h"
 #include "profile-count.h"
 #include "predict.h"
+#include "optabs.h"
 
 /* Expand a block clear operation, and return 1 if successful.  Return 0
    if we should let the compiler generate normal code.
@@ -2734,6 +2735,128 @@ gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
   return gen_lxvl (dest, addr, len);
 }
 
+/* Generate a lxvl/stxvl or lxvprl/stxvprl pair of instructions to move up to
+   16 or 32 bytes at a time.  */
+
+static void
+expand_lxvl_stxvl (rtx dest_addr,
+		   rtx src_addr,
+		   rtx bytes_rtx,
+		   int max_bytes)
+{
+  if (max_bytes > GET_MODE_SIZE (V16QImode))
+    {
+      rtx vreg = gen_reg_rtx (XOmode);
+      emit_insn (gen_lxvprl (vreg, src_addr, bytes_rtx));
+      emit_insn (gen_stxvprl (vreg, dest_addr, bytes_rtx));
+    }
+
+  else
+    {
+      rtx vreg = gen_reg_rtx (V16QImode);
+      emit_insn (gen_lxvl (vreg, src_addr, bytes_rtx));
+      emit_insn (gen_stxvl (vreg, dest_addr, bytes_rtx));
+    }
+
+  return;
+}
+
+/* Expand a variable block move operation, and return 1 if successful.
+   Return 0 if we should let the compiler generate normal code.
+
+   operands[0] is the destination
+   operands[1] is the source
+   operands[2] is the length
+   operands[3] is the alignment */
+
+static int
+expand_block_move_variable (rtx operands[], bool might_overlap)
+{
+  /* See if we have the necessary support for load/store vector with length,
+     and whether we want to do the optimization.  */
+  if (!TARGET_BLOCK_OPS_UNALIGNED_VSX
+      || !TARGET_P9_VECTOR
+      || !TARGET_64BIT
+      || rs6000_memcpy_inline_bytes == 0
+      || !param_vect_partial_vector_usage
+      || might_overlap
+      || !optimize
+      || optimize_size)
+    return 0;
+
+  rtx dest_addr = copy_addr_to_reg (XEXP (operands[0], 0));
+  rtx src_addr = copy_addr_to_reg (XEXP (operands[1], 0));
+
+  /* Check if we want to handle this with inline code.  */
+  rtx bytes_rtx = (GET_MODE (operands[2]) == Pmode
+		   ? copy_to_reg (operands[2])
+		   : convert_to_mode (Pmode, operands[2], true));
+
+  /* Maximum size to move at one time.  */
+  int vect_size_int
+    = (TARGET_FUTURE ? GET_MODE_SIZE (XOmode) : GET_MODE_SIZE (V16QImode));
+
+  /* Total size to move.  Limit the number of bytes that we do in this
+     optimization to just 2 variable moves.  Anything larger is left to the
+     glibc memcpy function, which provides extra optimizations.  */
+  int var_size_int = (rs6000_memcpy_inline_bytes > (2 * vect_size_int)
+		      ? 2 * vect_size_int
+		      : rs6000_memcpy_inline_bytes);
+
+  rtx var_size = GEN_INT (var_size_int);
+  rtx var_cr = gen_reg_rtx (CCUNSmode);
+  emit_insn (gen_rtx_SET (var_cr,
+			  gen_rtx_COMPARE (CCUNSmode, bytes_rtx, var_size)));
+
+  rtx var_label = gen_label_rtx ();
+  do_ifelse (CCUNSmode, LEU, NULL_RTX, NULL_RTX, var_cr, var_label,
+	     profile_probability::likely ());
+
+  /* Call memcpy if the size is too large.  */
+  tree fun = builtin_decl_explicit (BUILT_IN_MEMCPY);
+  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
+			   NULL_RTX, LCT_NORMAL, Pmode,
+			   dest_addr, Pmode,
+			   src_addr, Pmode,
+			   bytes_rtx, Pmode);
+
+  rtx join_label = gen_label_rtx ();
+  rtx join_ref = gen_rtx_LABEL_REF (VOIDmode, join_label);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, join_ref));
+  emit_barrier ();
+
+  emit_label (var_label);
+
+  /* We want to move the bytes inline.  Move 0..16 or 0..32 bytes now.  */
+  if (vect_size_int > var_size_int)
+    vect_size_int = var_size_int;
+
+  expand_lxvl_stxvl (dest_addr, src_addr, bytes_rtx, vect_size_int);
+
+  /* If we have more than 16/32 bytes, adjust the pointers/length and generate
+     a second move.  */
+  if (var_size_int > vect_size_int)
+    {
+      emit_insn (gen_add2_insn (bytes_rtx, GEN_INT (- vect_size_int)));
+
+      rtx vect_cr = gen_reg_rtx (CCmode);
+      emit_insn (gen_rtx_SET (vect_cr,
+			      gen_rtx_COMPARE (CCmode, bytes_rtx,
+					       const0_rtx)));
+
+      do_ifelse (CCmode, LE, NULL_RTX, NULL_RTX, vect_cr, join_label,
+		 profile_probability::likely ());
+
+      rtx ptr_adjust = GEN_INT (vect_size_int);
+      emit_insn (gen_add2_insn (dest_addr, ptr_adjust));
+      emit_insn (gen_add2_insn (src_addr, ptr_adjust));
+      expand_lxvl_stxvl (dest_addr, src_addr, bytes_rtx, vect_size_int);
+    }
+
+  emit_label (join_label);
+  return 1;
+}
+
 /* Expand a block move operation, and return 1 if successful.  Return 0 if
    we should let the compiler generate normal code.
@@ -2760,9 +2883,11 @@ expand_block_move (rtx operands[], bool might_overlap)
   rtx stores[MAX_MOVE_REG];
   int num_reg = 0;
 
-  /* If this is not a fixed size move, just call memcpy */
+  /* If this is not a fixed size move, see if we can use load/store vector
+     with length to handle multiple bytes.  Don't do the optimization if -Os.
+     Otherwise, just call memcpy.  */
   if (! constp)
-    return 0;
+    return expand_block_move_variable (operands, might_overlap);
 
   /* This must be a fixed size alignment */
   gcc_assert (CONST_INT_P (align_rtx));
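
When rs6000-memcpy-inline-bytes is raised above the vector size,
expand_block_move_variable emits at most two lxvl/stxvl (or
lxvprl/stxvprl) pairs before giving up and calling memcpy.  A hedged C
sketch of that two-move path, again with invented names, assuming
--param=rs6000-memcpy-inline-bytes=32 on a power9 target:

    #include <altivec.h>

    /* Sketch of the two-move path: copy 1..32 bytes with two 16-byte
       lxvl/stxvl pairs.  Illustrative only, not part of the patch.  */
    static void
    copy_up_to_32 (unsigned char *dest, unsigned char *src, long n)
    {
      /* The first pair copies min (n, 16) bytes.  */
      vec_xst_len (vec_xl_len (src, n), dest, n);

      /* Decrement the length; if anything remains, advance the pointers
         and copy the remaining 1..16 bytes with a second pair.  */
      n -= 16;
      if (n > 0)
        vec_xst_len (vec_xl_len (src + 16, n), dest + 16, n);
    }
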
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index e9dfb138603..12bae0d32a7 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -9880,11 +9880,11 @@
 ;; Argument 2 is the length
 ;; Argument 3 is the alignment
 
-(define_expand "cpymemsi"
+(define_expand "cpymem<mode>"
   [(parallel [(set (match_operand:BLK 0 "")
 		   (match_operand:BLK 1 ""))
-	      (use (match_operand:SI 2 ""))
-	      (use (match_operand:SI 3 ""))])]
+	      (use (match_operand:GPR 2 ""))
+	      (use (match_operand:GPR 3 ""))])]
   ""
 {
   if (expand_block_move (operands, false))
@@ -9899,11 +9899,11 @@
 ;; Argument 2 is the length
 ;; Argument 3 is the alignment
 
-(define_expand "movmemsi"
+(define_expand "movmem<mode>"
   [(parallel [(set (match_operand:BLK 0 "")
 		   (match_operand:BLK 1 ""))
-	      (use (match_operand:SI 2 ""))
-	      (use (match_operand:SI 3 ""))])]
+	      (use (match_operand:GPR 2 ""))
+	      (use (match_operand:GPR 3 ""))])]
   ""
 {
   if (expand_block_move (operands, true))
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 6872d359952..c594877ebc6 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -687,3 +687,8 @@ default value is 4.
 Target Undocumented Joined UInteger Var(rs6000_vect_unroll_reduc_threshold) Init(1) Param
 When reduction factor computed for a loop exceeds the threshold specified by
 this parameter, prefer to unroll this loop.  The default value is 1.
+
+-param=rs6000-memcpy-inline-bytes=
+Target Undocumented Joined UInteger Var(rs6000_memcpy_inline_bytes) Init(16) Param
+Maximum number of bytes to move with inline code before calling the memcpy
+library function.  The default value is 16.
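
As a usage example, a copy whose length is only known at run time, such
as the one below, previously always became a call to memcpy.  With this
patch it should instead expand to the inline compare/lxvl/stxvl/branch
sequence when compiled with -O2 -mcpu=power9 on a 64-bit target, and the
cutover point can be tuned with --param=rs6000-memcpy-inline-bytes=.
The testcase is illustrative; the generated assembly is not shown here:

    #include <string.h>

    /* N is not a compile-time constant, so expand_block_move sees a
       variable-sized move.  */
    void
    copy_n (void *dest, const void *src, size_t n)
    {
      memcpy (dest, src, n);
    }
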
+(define_insn "lxvprl" + [(set (match_operand:XO 0 "vsx_register_operand" "=wa") + (unspec:XO + [(match_operand:DI 1 "gpc_reg_operand" "b") + (mem:XO (match_dup 1)) + (match_operand:DI 2 "register_operand" "r")] + UNSPEC_LXVL))] + "TARGET_FUTURE && TARGET_64BIT" + "lxvprl %x0,%1,%2" + [(set_attr "type" "vecload")]) + +(define_insn "stxvprl" + [(set (mem:XO (match_operand:DI 1 "gpc_reg_operand" "b")) + (unspec:XO [(match_operand:XO 0 "vsx_register_operand" "wa") + (mem:XO (match_dup 1)) + (match_operand:DI 2 "register_operand" "r")] + UNSPEC_STXVLL))] + "TARGET_FUTURE" + "stxvprl %x0,%1,%2" + [(set_attr "type" "vecstore")]) + ;; Vector Compare Not Equal Byte (specified/not+eq:) (define_insn "vcmpneb" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v")