From: Stefan Schulze Frielinghaus <stefansf@linux.ibm.com>
To: krebbel@linux.ibm.com, gcc-patches@gcc.gnu.org
Cc: Stefan Schulze Frielinghaus <stefansf@linux.ibm.com>
Subject: [PATCH 3/3] s390: Refactor block operation setmem
Date: Mon, 15 May 2023 09:17:38 +0200 [thread overview]
Message-ID: <20230515071738.563660-4-stefansf@linux.ibm.com> (raw)
In-Reply-To: <20230515071738.563660-1-stefansf@linux.ibm.com>
Vectorize memset with a constant length of less than or equal to 64
bytes.
Do not perform a libc function call into memset in case the size is not
a compile-time constant but bounded and the upper bound is less than or
equal to 256 bytes.
gcc/ChangeLog:
* config/s390/s390-protos.h (s390_expand_setmem): Change
function signature.
* config/s390/s390.cc (s390_expand_setmem): For memsets of less
than or equal to 256 bytes do not perform a libc call.
* config/s390/s390.md: Change expander into a version which
takes 8 operands.
gcc/testsuite/ChangeLog:
* gcc.target/s390/memset-1.c: Test case memset1 now makes use of
vst.
---
gcc/config/s390/s390-protos.h | 2 +-
gcc/config/s390/s390.cc | 129 +++++++++++++++++++++--
gcc/config/s390/s390.md | 14 ++-
gcc/testsuite/gcc.target/s390/memset-1.c | 7 +-
4 files changed, 132 insertions(+), 20 deletions(-)
diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index 65e4f97b41e..4a5263fccec 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -109,7 +109,7 @@ extern void emit_symbolic_move (rtx *);
extern void s390_load_address (rtx, rtx);
extern bool s390_expand_cpymem (rtx, rtx, rtx, rtx, rtx);
extern bool s390_expand_movmem (rtx, rtx, rtx, rtx, rtx);
-extern void s390_expand_setmem (rtx, rtx, rtx);
+extern void s390_expand_setmem (rtx, rtx, rtx, rtx, rtx);
extern bool s390_expand_cmpmem (rtx, rtx, rtx, rtx);
extern void s390_expand_vec_strlen (rtx, rtx, rtx);
extern void s390_expand_vec_movstr (rtx, rtx, rtx);
diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index 553273f23ff..b1cb54612b8 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -5910,20 +5910,62 @@ s390_expand_movmem (rtx dst, rtx src, rtx len, rtx min_len_rtx, rtx max_len_rtx)
Make use of clrmem if VAL is zero. */
void
-s390_expand_setmem (rtx dst, rtx len, rtx val)
+s390_expand_setmem (rtx dst, rtx len, rtx val, rtx min_len_rtx, rtx max_len_rtx)
{
- if (GET_CODE (len) == CONST_INT && INTVAL (len) <= 0)
+ /* Exit early in case nothing has to be done. */
+ if (CONST_INT_P (len) && UINTVAL (len) == 0)
return;
gcc_assert (GET_CODE (val) == CONST_INT || GET_MODE (val) == QImode);
+ unsigned HOST_WIDE_INT min_len = UINTVAL (min_len_rtx);
+ unsigned HOST_WIDE_INT max_len
+ = max_len_rtx ? UINTVAL (max_len_rtx) : HOST_WIDE_INT_M1U;
+
+ /* Vectorize memset with a constant length
+ - if 0 < LEN < 16, then emit a vstl based solution;
+ - if 16 <= LEN <= 64, then emit a vst based solution
+ where the last two vector stores may overlap in case LEN%16!=0. Paying
+ the price for an overlap is negligible compared to an extra GPR which is
+ required for vstl. */
+ if (CONST_INT_P (len) && UINTVAL (len) <= 64 && val != const0_rtx
+ && TARGET_VX)
+ {
+ rtx val_vec = gen_reg_rtx (V16QImode);
+ emit_move_insn (val_vec, gen_rtx_VEC_DUPLICATE (V16QImode, val));
+
+ if (UINTVAL (len) < 16)
+ {
+ rtx len_reg = gen_reg_rtx (SImode);
+ emit_move_insn (len_reg, GEN_INT (UINTVAL (len) - 1));
+ emit_insn (gen_vstlv16qi (val_vec, len_reg, dst));
+ }
+ else
+ {
+ unsigned HOST_WIDE_INT l = UINTVAL (len) / 16;
+ unsigned HOST_WIDE_INT r = UINTVAL (len) % 16;
+ unsigned HOST_WIDE_INT o = 0;
+ for (unsigned HOST_WIDE_INT i = 0; i < l; ++i)
+ {
+ rtx newdst = adjust_address (dst, V16QImode, o);
+ emit_move_insn (newdst, val_vec);
+ o += 16;
+ }
+ if (r != 0)
+ {
+ rtx newdst = adjust_address (dst, V16QImode, (o - 16) + r);
+ emit_move_insn (newdst, val_vec);
+ }
+ }
+ }
+
/* Expand setmem/clrmem for a constant length operand without a
loop if it will be shorter that way.
clrmem loop (with PFD) is 30 bytes -> 5 * xc
clrmem loop (without PFD) is 24 bytes -> 4 * xc
setmem loop (with PFD) is 38 bytes -> ~4 * (mvi/stc + mvc)
setmem loop (without PFD) is 32 bytes -> ~4 * (mvi/stc + mvc) */
- if (GET_CODE (len) == CONST_INT
+ else if (GET_CODE (len) == CONST_INT
&& ((val == const0_rtx
&& (INTVAL (len) <= 256 * 4
|| (INTVAL (len) <= 256 * 5 && TARGET_SETMEM_PFD(val,len))))
@@ -5968,6 +6010,70 @@ s390_expand_setmem (rtx dst, rtx len, rtx val)
val));
}
+ /* Non-constant length and no loop required. */
+ else if (!CONST_INT_P (len) && max_len <= 256)
+ {
+ rtx_code_label *end_label;
+
+ if (min_len == 0)
+ {
+ end_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (len, const0_rtx, EQ, NULL_RTX,
+ GET_MODE (len), 1, end_label,
+ profile_probability::very_unlikely ());
+ }
+
+ rtx lenm1 = expand_binop (GET_MODE (len), add_optab, len, constm1_rtx,
+ NULL_RTX, 1, OPTAB_DIRECT);
+
+ /* Prefer a vectorized implementation over one which makes use of an
+ execute instruction since it is faster (although it increases register
+ pressure). */
+ if (max_len <= 16 && TARGET_VX)
+ {
+ rtx val_vec = gen_reg_rtx (V16QImode);
+ if (val == const0_rtx)
+ emit_move_insn (val_vec, CONST0_RTX (V16QImode));
+ else
+ emit_move_insn (val_vec, gen_rtx_VEC_DUPLICATE (V16QImode, val));
+
+ lenm1 = convert_to_mode (SImode, lenm1, 1);
+ emit_insn (gen_vstlv16qi (val_vec, lenm1, dst));
+ }
+ else
+ {
+ if (val == const0_rtx)
+ emit_insn (
+ gen_clrmem_short (dst, convert_to_mode (Pmode, lenm1, 1)));
+ else
+ {
+ emit_move_insn (adjust_address (dst, QImode, 0), val);
+
+ rtx_code_label *onebyte_end_label;
+ if (min_len <= 1)
+ {
+ onebyte_end_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (
+ len, const1_rtx, EQ, NULL_RTX, GET_MODE (len), 1,
+ onebyte_end_label, profile_probability::very_unlikely ());
+ }
+
+ rtx dstp1 = adjust_address (dst, VOIDmode, 1);
+ rtx lenm2
+ = expand_binop (GET_MODE (len), add_optab, len, GEN_INT (-2),
+ NULL_RTX, 1, OPTAB_DIRECT);
+ lenm2 = convert_to_mode (Pmode, lenm2, 1);
+ emit_insn (gen_cpymem_short (dstp1, dst, lenm2));
+
+ if (min_len <= 1)
+ emit_label (onebyte_end_label);
+ }
+ }
+
+ if (min_len == 0)
+ emit_label (end_label);
+ }
+
else
{
rtx dst_addr, count, blocks, temp, dstp1 = NULL_RTX;
@@ -5986,9 +6092,10 @@ s390_expand_setmem (rtx dst, rtx len, rtx val)
blocks = gen_reg_rtx (mode);
convert_move (count, len, 1);
- emit_cmp_and_jump_insns (count, const0_rtx,
- EQ, NULL_RTX, mode, 1, zerobyte_end_label,
- profile_probability::very_unlikely ());
+ if (min_len == 0)
+ emit_cmp_and_jump_insns (count, const0_rtx, EQ, NULL_RTX, mode, 1,
+ zerobyte_end_label,
+ profile_probability::very_unlikely ());
/* We need to make a copy of the target address since memset is
supposed to return it unmodified. We have to make it here
@@ -6003,10 +6110,10 @@ s390_expand_setmem (rtx dst, rtx len, rtx val)
the mvc reading this value). */
set_mem_size (dst, 1);
dstp1 = adjust_address (dst, VOIDmode, 1);
- emit_cmp_and_jump_insns (count,
- const1_rtx, EQ, NULL_RTX, mode, 1,
- onebyte_end_label,
- profile_probability::very_unlikely ());
+ if (min_len <= 1)
+ emit_cmp_and_jump_insns (count, const1_rtx, EQ, NULL_RTX, mode, 1,
+ onebyte_end_label,
+ profile_probability::very_unlikely ());
}
/* There is one unconditional (mvi+mvc)/xc after the loop
@@ -6029,7 +6136,7 @@ s390_expand_setmem (rtx dst, rtx len, rtx val)
emit_jump (loop_start_label);
- if (val != const0_rtx)
+ if (val != const0_rtx && min_len <= 1)
{
/* The 1 byte != 0 special case. Not handled efficiently
since we require two jumps for that. However, this
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index abe3bbc5cd9..9631b2a8c60 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -3595,12 +3595,16 @@
;
(define_expand "setmem<mode>"
- [(set (match_operand:BLK 0 "memory_operand" "")
- (match_operand:QI 2 "general_operand" ""))
- (use (match_operand:GPR 1 "general_operand" ""))
- (match_operand 3 "" "")]
+ [(set (match_operand:BLK 0 "memory_operand" "") ; destination
+ (match_operand:QI 2 "general_operand" "")) ; value
+ (use (match_operand:GPR 1 "general_operand" "")) ; size
+ (match_operand 3 "") ; align
+ (match_operand 4 "") ; expected align
+ (match_operand 5 "") ; expected size
+ (match_operand 6 "") ; minimal size
+ (match_operand 7 "")] ; maximal size
""
- "s390_expand_setmem (operands[0], operands[1], operands[2]); DONE;")
+ "s390_expand_setmem (operands[0], operands[1], operands[2], operands[6], operands[7]); DONE;")
; Clear a block that is up to 256 bytes in length.
; The block length is taken as (operands[1] % 256) + 1.
diff --git a/gcc/testsuite/gcc.target/s390/memset-1.c b/gcc/testsuite/gcc.target/s390/memset-1.c
index 9463a77208b..5eb96112f13 100644
--- a/gcc/testsuite/gcc.target/s390/memset-1.c
+++ b/gcc/testsuite/gcc.target/s390/memset-1.c
@@ -11,7 +11,7 @@ void
return __builtin_memset (s, c, 1);
}
-/* 1 stc 1 mvc */
+/* 3 vst */
void
*memset1(void *s, int c)
{
@@ -170,8 +170,9 @@ void
}
/* { dg-final { scan-assembler-times "mvi\\s" 1 } } */
-/* { dg-final { scan-assembler-times "mvc\\s" 20 } } */
+/* { dg-final { scan-assembler-times "mvc\\s" 19 } } */
/* { dg-final { scan-assembler-times "xc\\s" 28 } } */
-/* { dg-final { scan-assembler-times "stc\\s" 22 } } */
+/* { dg-final { scan-assembler-times "stc\\s" 21 } } */
/* { dg-final { scan-assembler-times "stcy\\s" 0 } } */
/* { dg-final { scan-assembler-times "pfd\\s" 2 } } */
+/* { dg-final { scan-assembler-times "vst\\s" 3 } } */
--
2.39.2
next prev parent reply other threads:[~2023-05-15 7:18 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-05-15 7:17 [PATCH 0/3] Refactor memory block operations Stefan Schulze Frielinghaus
2023-05-15 7:17 ` [PATCH 1/3] s390: Refactor block operation cpymem Stefan Schulze Frielinghaus
2023-05-15 7:17 ` [PATCH 2/3] s390: Add block operation movmem Stefan Schulze Frielinghaus
2023-05-15 7:17 ` Stefan Schulze Frielinghaus [this message]
2023-05-15 20:18 ` [PATCH 0/3] Refactor memory block operations Andreas Krebbel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230515071738.563660-4-stefansf@linux.ibm.com \
--to=stefansf@linux.ibm.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=krebbel@linux.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).