public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/meissner/heads/dmf004)] Add support to use stxvl for variable sized memsets.
@ 2022-11-17 21:56 Michael Meissner
  0 siblings, 0 replies; 2+ messages in thread
From: Michael Meissner @ 2022-11-17 21:56 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:badbdb6089f00592e2c91848cd93f808db79520d

commit badbdb6089f00592e2c91848cd93f808db79520d
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Thu Nov 17 15:05:02 2022 -0500

    Add support to use stxvl for variable sized memsets.
    
    2022-11-17   Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/rs6000-protos.h (expand_block_set): Add declaration.
            * config/rs6000/rs6000-string.cc (expand_block_set): New support for
            optimizing variable sized memsets.
            * config/rs6000/rs6000.md (setmem<mode>): Add setmemdi along with
            setmemsi.  Add support for optimizing memsets of other bytes than just
            0.  Add support for using stxvl to support variable sized sets.
            * config/rs6000/rs6000.opt (--param rs6000-memcpy-inline-bytes): Make
            the default 16, not 32.
            (--param rs6000-memset-inline-bytes): New parameter.

Diff:
---
 gcc/config/rs6000/rs6000-protos.h  |  1 +
 gcc/config/rs6000/rs6000-string.cc | 87 ++++++++++++++++++++++++++++++++++++++
 gcc/config/rs6000/rs6000.md        | 16 +++----
 gcc/config/rs6000/rs6000.opt       |  9 +++-
 4 files changed, 101 insertions(+), 12 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index d0d89320ef6..07f0759e19c 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -69,6 +69,7 @@ extern void rs6000_generate_float2_code (bool, rtx, rtx, rtx);
 extern void rs6000_generate_float2_double_code (rtx, rtx, rtx);
 extern void rs6000_generate_vsigned2_code (bool, rtx, rtx, rtx);
 extern int expand_block_clear (rtx[]);
+extern int expand_block_set (rtx[]);
 extern int expand_block_move (rtx[], bool);
 extern bool expand_block_compare (rtx[]);
 extern bool expand_strn_compare (rtx[], int);
diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc
index ee17ddb87e1..4649993cf1a 100644
--- a/gcc/config/rs6000/rs6000-string.cc
+++ b/gcc/config/rs6000/rs6000-string.cc
@@ -39,6 +39,11 @@
 #include "predict.h"
 #include "optabs.h"
 
+/* Forward reference.  */
+static void do_ifelse (machine_mode cmpmode, rtx_code comparison,
+		       rtx a, rtx b, rtx cr, rtx true_label,
+		       profile_probability br_prob);
+
 /* Expand a block clear operation, and return 1 if successful.  Return 0
    if we should let the compiler generate normal code.
 
@@ -148,6 +153,88 @@ expand_block_clear (rtx operands[])
   return 1;
 }
 
+/* Expand a block set operation, and return 1 if successful.  Return 0
+   if we should let the compiler generate normal code.
+
+   operands[0] is the destination
+   operands[1] is the length
+   operands[2] is the value to set memory to (normally 0)
+   operands[3] is the alignment */
+
+int
+expand_block_set (rtx operands[])
+{
+  rtx bytes_rtx	= operands[1];
+  rtx set_byte = operands[2];
+  bool constp = CONST_INT_P (bytes_rtx);
+
+  /* At the moment, only handle setting memory to a constant.  */
+  if (!CONST_INT_P (set_byte)
+      || !IN_RANGE (INTVAL (set_byte), -127, 255))
+    return 0;
+
+  /* If we are storing to a memory region with a variable size, see if we have
+     the necessary support for store vector with length, and we want to do the
+     optimization.  Fall back to using the clear memory support if we don't
+     want to use stxvl using an inline test.  */
+  if (constp
+      || !TARGET_BLOCK_OPS_UNALIGNED_VSX
+      || !TARGET_P9_VECTOR
+      || !TARGET_64BIT
+      || rs6000_memset_inline_bytes == 0
+      || !param_vect_partial_vector_usage
+      || !optimize
+      || optimize_size)
+    {
+      if (set_byte == const0_rtx)
+	return expand_block_clear (operands);
+
+      return 0;
+    }
+
+  rtx dest_addr = force_reg (Pmode, XEXP (operands[0], 0));
+  int vect_size_int = (rs6000_memset_inline_bytes >= GET_MODE_SIZE (V16QImode)
+		       ? GET_MODE_SIZE (V16QImode)
+		       : rs6000_memset_inline_bytes);
+
+  rtx vect_size = GEN_INT (vect_size_int);
+  rtx var_cr = gen_reg_rtx (CCUNSmode);
+  emit_insn (gen_rtx_SET (var_cr,
+			  gen_rtx_COMPARE (CCUNSmode, bytes_rtx, vect_size)));
+				  
+  rtx var_label = gen_label_rtx ();
+  do_ifelse (CCUNSmode, LEU, NULL_RTX, NULL_RTX, var_cr, var_label,
+	     profile_probability::likely ());
+
+  /* Call memset if the size is too large.  */
+  tree fun = builtin_decl_explicit (BUILT_IN_MEMSET);
+  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
+			   NULL_RTX, LCT_NORMAL, Pmode,
+			   dest_addr, Pmode,
+			   set_byte, SImode,
+			   bytes_rtx, Pmode);
+
+  rtx join_label = gen_label_rtx ();
+  rtx join_ref = gen_rtx_LABEL_REF (VOIDmode, join_label);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, join_ref));
+  emit_barrier ();
+
+  emit_label (var_label);
+
+  if (IN_RANGE (INTVAL (set_byte), 128, 255))
+    set_byte = GEN_INT (((INTVAL (set_byte) & 0xff) ^ 0x80) - 0x80);
+
+  /* Create the vector with the bytes splatted.  */
+  rtx vreg = gen_reg_rtx (V16QImode);
+  emit_insn (gen_xxspltib_v16qi (vreg, set_byte));
+
+  /* We want to set bytes inline.  Set 0..16 bytes now.  */
+  emit_insn (gen_stxvl (vreg, dest_addr, bytes_rtx));
+
+  emit_label (join_label);
+  return 1;
+}
+
 /* Figure out the correct instructions to generate to load data for
    block compare.  MODE is used for the read from memory, and
    data is zero extended if REG is wider than MODE.  If LE code
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 12bae0d32a7..6d9d08c6172 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -9790,18 +9790,14 @@
   DONE;
 })
 \f
-(define_expand "setmemsi"
-  [(parallel [(set (match_operand:BLK 0 "")
-		   (match_operand 2 "const_int_operand"))
-	      (use (match_operand:SI 1 ""))
-	      (use (match_operand:SI 3 ""))])]
+(define_expand "setmem<mode>"
+  [(use (match_operand:BLK 0 ""))
+   (use (match_operand:GPR 1 ""))
+   (use (match_operand:SI 2 ""))
+   (use (match_operand:SI 3 ""))]
   ""
 {
-  /* If value to set is not zero, use the library routine.  */
-  if (operands[2] != const0_rtx)
-    FAIL;
-
-  if (expand_block_clear (operands))
+  if (expand_block_set (operands))
     DONE;
   else
     FAIL;
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 602930063cd..30641de5ac3 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -689,6 +689,11 @@ When reduction factor computed for a loop exceeds the threshold specified by
 this parameter, prefer to unroll this loop.  The default value is 1.
 
 -param=rs6000-memcpy-inline-bytes=
-Target Undocumented Joined UInteger Var(rs6000_memcpy_inline_bytes) Init(32) Param
+Target Undocumented Joined UInteger Var(rs6000_memcpy_inline_bytes) Init(16) Param
 Maximum number of bytes to move with inline code before calling the memcpy
-library function.  The default value is 32.
+library function.  The default value is 16.
+
+-param=rs6000-memset-inline-bytes=
+Target Undocumented Joined UInteger Var(rs6000_memset_inline_bytes) Init(16) Param
+Maximum number of bytes to move with inline code before calling the memset
+library function.  The default value is 16.

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [gcc(refs/users/meissner/heads/dmf004)] Add support to use stxvl for variable sized memsets.
@ 2022-11-17 20:05 Michael Meissner
  0 siblings, 0 replies; 2+ messages in thread
From: Michael Meissner @ 2022-11-17 20:05 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:3f253637de887a3119ac881eeb0e743b8112367c

commit 3f253637de887a3119ac881eeb0e743b8112367c
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Thu Nov 17 15:05:02 2022 -0500

    Add support to use stxvl for variable sized memsets.
    
    2022-11-17   Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/rs6000-protos.h (expand_block_set): Add declaration.
            * config/rs6000/rs6000-string.cc (expand_block_set): New support for
            optimizing variable sized memsets.
            * config/rs6000/rs6000.md (setmem<mode>): Add setmemdi along with
            setmemsi.  Add support for optimizing memsets of other bytes than just
            0.  Add support for using stxvl to support variable sized sets.
            * config/rs6000/rs6000.opt (--param rs6000-memcpy-inline-bytes): Make
            the default 16, not 32.
            (--param rs6000-memset-inline-bytes): New parameter.

Diff:
---
 gcc/config/rs6000/rs6000-protos.h  |  1 +
 gcc/config/rs6000/rs6000-string.cc | 87 ++++++++++++++++++++++++++++++++++++++
 gcc/config/rs6000/rs6000.md        | 16 +++----
 gcc/config/rs6000/rs6000.opt       |  9 +++-
 4 files changed, 101 insertions(+), 12 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index d0d89320ef6..07f0759e19c 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -69,6 +69,7 @@ extern void rs6000_generate_float2_code (bool, rtx, rtx, rtx);
 extern void rs6000_generate_float2_double_code (rtx, rtx, rtx);
 extern void rs6000_generate_vsigned2_code (bool, rtx, rtx, rtx);
 extern int expand_block_clear (rtx[]);
+extern int expand_block_set (rtx[]);
 extern int expand_block_move (rtx[], bool);
 extern bool expand_block_compare (rtx[]);
 extern bool expand_strn_compare (rtx[], int);
diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc
index ee17ddb87e1..4649993cf1a 100644
--- a/gcc/config/rs6000/rs6000-string.cc
+++ b/gcc/config/rs6000/rs6000-string.cc
@@ -39,6 +39,11 @@
 #include "predict.h"
 #include "optabs.h"
 
+/* Forward reference.  */
+static void do_ifelse (machine_mode cmpmode, rtx_code comparison,
+		       rtx a, rtx b, rtx cr, rtx true_label,
+		       profile_probability br_prob);
+
 /* Expand a block clear operation, and return 1 if successful.  Return 0
    if we should let the compiler generate normal code.
 
@@ -148,6 +153,88 @@ expand_block_clear (rtx operands[])
   return 1;
 }
 
+/* Expand a block set operation, and return 1 if successful.  Return 0
+   if we should let the compiler generate normal code.
+
+   operands[0] is the destination
+   operands[1] is the length
+   operands[2] is the value to set memory to (normally 0)
+   operands[3] is the alignment */
+
+int
+expand_block_set (rtx operands[])
+{
+  rtx bytes_rtx	= operands[1];
+  rtx set_byte = operands[2];
+  bool constp = CONST_INT_P (bytes_rtx);
+
+  /* At the moment, only handle setting memory to a constant.  */
+  if (!CONST_INT_P (set_byte)
+      || !IN_RANGE (INTVAL (set_byte), -127, 255))
+    return 0;
+
+  /* If we are storing to a memory region with a variable size, see if we have
+     the necessary support for store vector with length, and we want to do the
+     optimization.  Fall back to using the clear memory support if we don't
+     want to use stxvl using an inline test.  */
+  if (constp
+      || !TARGET_BLOCK_OPS_UNALIGNED_VSX
+      || !TARGET_P9_VECTOR
+      || !TARGET_64BIT
+      || rs6000_memset_inline_bytes == 0
+      || !param_vect_partial_vector_usage
+      || !optimize
+      || optimize_size)
+    {
+      if (set_byte == const0_rtx)
+	return expand_block_clear (operands);
+
+      return 0;
+    }
+
+  rtx dest_addr = force_reg (Pmode, XEXP (operands[0], 0));
+  int vect_size_int = (rs6000_memset_inline_bytes >= GET_MODE_SIZE (V16QImode)
+		       ? GET_MODE_SIZE (V16QImode)
+		       : rs6000_memset_inline_bytes);
+
+  rtx vect_size = GEN_INT (vect_size_int);
+  rtx var_cr = gen_reg_rtx (CCUNSmode);
+  emit_insn (gen_rtx_SET (var_cr,
+			  gen_rtx_COMPARE (CCUNSmode, bytes_rtx, vect_size)));
+				  
+  rtx var_label = gen_label_rtx ();
+  do_ifelse (CCUNSmode, LEU, NULL_RTX, NULL_RTX, var_cr, var_label,
+	     profile_probability::likely ());
+
+  /* Call memset if the size is too large.  */
+  tree fun = builtin_decl_explicit (BUILT_IN_MEMSET);
+  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
+			   NULL_RTX, LCT_NORMAL, Pmode,
+			   dest_addr, Pmode,
+			   set_byte, SImode,
+			   bytes_rtx, Pmode);
+
+  rtx join_label = gen_label_rtx ();
+  rtx join_ref = gen_rtx_LABEL_REF (VOIDmode, join_label);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, join_ref));
+  emit_barrier ();
+
+  emit_label (var_label);
+
+  if (IN_RANGE (INTVAL (set_byte), 128, 255))
+    set_byte = GEN_INT (((INTVAL (set_byte) & 0xff) ^ 0x80) - 0x80);
+
+  /* Create the vector with the bytes splatted.  */
+  rtx vreg = gen_reg_rtx (V16QImode);
+  emit_insn (gen_xxspltib_v16qi (vreg, set_byte));
+
+  /* We want to set bytes inline.  Set 0..16 bytes now.  */
+  emit_insn (gen_stxvl (vreg, dest_addr, bytes_rtx));
+
+  emit_label (join_label);
+  return 1;
+}
+
 /* Figure out the correct instructions to generate to load data for
    block compare.  MODE is used for the read from memory, and
    data is zero extended if REG is wider than MODE.  If LE code
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 12bae0d32a7..6d9d08c6172 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -9790,18 +9790,14 @@
   DONE;
 })
 \f
-(define_expand "setmemsi"
-  [(parallel [(set (match_operand:BLK 0 "")
-		   (match_operand 2 "const_int_operand"))
-	      (use (match_operand:SI 1 ""))
-	      (use (match_operand:SI 3 ""))])]
+(define_expand "setmem<mode>"
+  [(use (match_operand:BLK 0 ""))
+   (use (match_operand:GPR 1 ""))
+   (use (match_operand:SI 2 ""))
+   (use (match_operand:SI 3 ""))]
   ""
 {
-  /* If value to set is not zero, use the library routine.  */
-  if (operands[2] != const0_rtx)
-    FAIL;
-
-  if (expand_block_clear (operands))
+  if (expand_block_set (operands))
     DONE;
   else
     FAIL;
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 602930063cd..30641de5ac3 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -689,6 +689,11 @@ When reduction factor computed for a loop exceeds the threshold specified by
 this parameter, prefer to unroll this loop.  The default value is 1.
 
 -param=rs6000-memcpy-inline-bytes=
-Target Undocumented Joined UInteger Var(rs6000_memcpy_inline_bytes) Init(32) Param
+Target Undocumented Joined UInteger Var(rs6000_memcpy_inline_bytes) Init(16) Param
 Maximum number of bytes to move with inline code before calling the memcpy
-library function.  The default value is 32.
+library function.  The default value is 16.
+
+-param=rs6000-memset-inline-bytes=
+Target Undocumented Joined UInteger Var(rs6000_memset_inline_bytes) Init(16) Param
+Maximum number of bytes to move with inline code before calling the memset
+library function.  The default value is 16.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2022-11-17 21:56 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-17 21:56 [gcc(refs/users/meissner/heads/dmf004)] Add support to use stxvl for variable sized memsets Michael Meissner
  -- strict thread matches above, loose matches on Subject: below --
2022-11-17 20:05 Michael Meissner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).