From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1005) id C2A203852C50; Thu, 17 Nov 2022 20:05:19 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C2A203852C50 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1668715519; bh=9KR3XdmCCyeVOFomC0y01791zhjzEv/i2pp0krAn9+U=; h=From:To:Subject:Date:From; b=jbSqpMlTZD1r3CHc11Qg15ngzVTnl19B80/aqnRWU7qRgmGURCx6HlfrWYbFqGbQj JNc2dQqiGMaHL04UgGZIBhwOev2VUfWa6916FYrYsHbjS7Kq0whx1nVBYaerWj9WgL RQTInaBUluJKsRVcE63FPtuXUyCufcaTp9W66x78= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Michael Meissner To: gcc-cvs@gcc.gnu.org Subject: [gcc(refs/users/meissner/heads/dmf004)] Add support to use stxvl for variable sized memsets. X-Act-Checkin: gcc X-Git-Author: Michael Meissner X-Git-Refname: refs/users/meissner/heads/dmf004 X-Git-Oldrev: 188e5f8f1068ee384029dd49fb4e4f00b4104036 X-Git-Newrev: 3f253637de887a3119ac881eeb0e743b8112367c Message-Id: <20221117200519.C2A203852C50@sourceware.org> Date: Thu, 17 Nov 2022 20:05:19 +0000 (GMT) List-Id: https://gcc.gnu.org/g:3f253637de887a3119ac881eeb0e743b8112367c commit 3f253637de887a3119ac881eeb0e743b8112367c Author: Michael Meissner Date: Thu Nov 17 15:05:02 2022 -0500 Add support to use stxvl for variable sized memsets. 2022-11-17 Michael Meissner gcc/ * config/rs6000/rs6000-protos.h (expand_block_set): Add declaration. * config/rs6000/rs6000-string.cc (expand_block_set): New support for optimizing variable sized memsets. * config/rs6000/rs6000.md (setmem): Add setmemdi along with setmemsi. Add support for optimizing memsets of other bytes than just 0. Add support for using stxvl to support variable sized sets. * config/rs6000/rs6000.opt (--param rs6000-memcpy-inline-bytes): Make the default 16, not 32. (--param rs6000-memset-inline-bytes): New parameter. 
Diff: --- gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000-string.cc | 87 ++++++++++++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000.md | 16 +++---- gcc/config/rs6000/rs6000.opt | 9 +++- 4 files changed, 101 insertions(+), 12 deletions(-) diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index d0d89320ef6..07f0759e19c 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -69,6 +69,7 @@ extern void rs6000_generate_float2_code (bool, rtx, rtx, rtx); extern void rs6000_generate_float2_double_code (rtx, rtx, rtx); extern void rs6000_generate_vsigned2_code (bool, rtx, rtx, rtx); extern int expand_block_clear (rtx[]); +extern int expand_block_set (rtx[]); extern int expand_block_move (rtx[], bool); extern bool expand_block_compare (rtx[]); extern bool expand_strn_compare (rtx[], int); diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc index ee17ddb87e1..4649993cf1a 100644 --- a/gcc/config/rs6000/rs6000-string.cc +++ b/gcc/config/rs6000/rs6000-string.cc @@ -39,6 +39,11 @@ #include "predict.h" #include "optabs.h" +/* Forward reference. */ +static void do_ifelse (machine_mode cmpmode, rtx_code comparison, + rtx a, rtx b, rtx cr, rtx true_label, + profile_probability br_prob); + /* Expand a block clear operation, and return 1 if successful. Return 0 if we should let the compiler generate normal code. @@ -148,6 +153,88 @@ expand_block_clear (rtx operands[]) return 1; } +/* Expand a block set operation, and return 1 if successful. Return 0 + if we should let the compiler generate normal code. + + operands[0] is the destination + operands[1] is the length + operands[2] is the value to set memory to (normally 0) + operands[3] is the alignment */ + +int +expand_block_set (rtx operands[]) +{ + rtx bytes_rtx = operands[1]; + rtx set_byte = operands[2]; + bool constp = CONST_INT_P (bytes_rtx); + + /* At the moment, only handle setting memory to a constant. 
*/ + if (!CONST_INT_P (set_byte) + || !IN_RANGE (INTVAL (set_byte), -127, 255)) + return 0; + + /* If we are storing to a memory region with a variable size, see if we have + the necessary support for store vector with length, and we want to do the + optimization. Fall back to using the clear memory support if we don't + want to use stxvl using an inline test. */ + if (constp + || !TARGET_BLOCK_OPS_UNALIGNED_VSX + || !TARGET_P9_VECTOR + || !TARGET_64BIT + || rs6000_memset_inline_bytes == 0 + || !param_vect_partial_vector_usage + || !optimize + || optimize_size) + { + if (set_byte == const0_rtx) + return expand_block_clear (operands); + + return 0; + } + + rtx dest_addr = force_reg (Pmode, XEXP (operands[0], 0)); + int vect_size_int = (rs6000_memset_inline_bytes >= GET_MODE_SIZE (V16QImode) + ? GET_MODE_SIZE (V16QImode) + : rs6000_memset_inline_bytes); + + rtx vect_size = GEN_INT (vect_size_int); + rtx var_cr = gen_reg_rtx (CCUNSmode); + emit_insn (gen_rtx_SET (var_cr, + gen_rtx_COMPARE (CCUNSmode, bytes_rtx, vect_size))); + + rtx var_label = gen_label_rtx (); + do_ifelse (CCUNSmode, LEU, NULL_RTX, NULL_RTX, var_cr, var_label, + profile_probability::likely ()); + + /* Call memset if the size is too large. */ + tree fun = builtin_decl_explicit (BUILT_IN_MEMSET); + emit_library_call_value (XEXP (DECL_RTL (fun), 0), + NULL_RTX, LCT_NORMAL, Pmode, + dest_addr, Pmode, + set_byte, SImode, + bytes_rtx, Pmode); + + rtx join_label = gen_label_rtx (); + rtx join_ref = gen_rtx_LABEL_REF (VOIDmode, join_label); + emit_jump_insn (gen_rtx_SET (pc_rtx, join_ref)); + emit_barrier (); + + emit_label (var_label); + + if (IN_RANGE (INTVAL (set_byte), 128, 255)) + set_byte = GEN_INT (((INTVAL (set_byte) & 0xff) ^ 0x80) - 0x80); + + /* Create the vector with the bytes splatted. */ + rtx vreg = gen_reg_rtx (V16QImode); + emit_insn (gen_xxspltib_v16qi (vreg, set_byte)); + + /* We want to set bytes inline. Set 0..16 bytes now. 
*/ + emit_insn (gen_stxvl (vreg, dest_addr, bytes_rtx)); + + emit_label (join_label); + return 1; +} + /* Figure out the correct instructions to generate to load data for block compare. MODE is used for the read from memory, and data is zero extended if REG is wider than MODE. If LE code diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 12bae0d32a7..6d9d08c6172 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -9790,18 +9790,14 @@ DONE; }) -(define_expand "setmemsi" - [(parallel [(set (match_operand:BLK 0 "") - (match_operand 2 "const_int_operand")) - (use (match_operand:SI 1 "")) - (use (match_operand:SI 3 ""))])] +(define_expand "setmem" + [(use (match_operand:BLK 0 "")) + (use (match_operand:GPR 1 "")) + (use (match_operand:SI 2 "")) + (use (match_operand:SI 3 ""))] "" { - /* If value to set is not zero, use the library routine. */ - if (operands[2] != const0_rtx) - FAIL; - - if (expand_block_clear (operands)) + if (expand_block_set (operands)) DONE; else FAIL; diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 602930063cd..30641de5ac3 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -689,6 +689,11 @@ When reduction factor computed for a loop exceeds the threshold specified by this parameter, prefer to unroll this loop. The default value is 1. -param=rs6000-memcpy-inline-bytes= -Target Undocumented Joined UInteger Var(rs6000_memcpy_inline_bytes) Init(32) Param +Target Undocumented Joined UInteger Var(rs6000_memcpy_inline_bytes) Init(16) Param Maximum number of bytes to move with inline code before calling the memcpy -library function. The default value is 32. +library function. The default value is 16. + +-param=rs6000-memset-inline-bytes= +Target Undocumented Joined UInteger Var(rs6000_memset_inline_bytes) Init(16) Param +Maximum number of bytes to move with inline code before calling the memset +library function. The default value is 16.