* [gcc r12-5934] aarch64: Use +mops to inline memset operations
From: Kyrylo Tkachov @ 2021-12-13 15:18 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:d3bd985e799b63e2133e89870472ac36d06015d3

commit r12-5934-gd3bd985e799b63e2133e89870472ac36d06015d3
Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date:   Mon Dec 13 14:14:21 2021 +0000

    aarch64: Use +mops to inline memset operations
    
    This third patch in the series adds an inline expansion sequence for the memset operation.
    The new aarch64-mops-memset-size-threshold param controls the size threshold at which the
    sequence is used (a usage sketch follows the diffstat below).  Its default setting is 256,
    which may seem a bit high, but it is consistent with the current SIMD memset inline
    sequence limit, and future CPU tunings can easily override it as needed.
    
    Bootstrapped and tested on aarch64-none-linux-gnu.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64.c (aarch64_expand_setmem_mops): Define.
            (aarch64_expand_setmem): Adjust for TARGET_MOPS.
            * config/aarch64/aarch64.h (CLEAR_RATIO): Adjust for TARGET_MOPS.
            (SET_RATIO): Likewise.
            * config/aarch64/aarch64.md ("unspec"): Add UNSPEC_SETMEM.
            (aarch64_setmemdi): Define.
            (setmemdi): Adjust for TARGET_MOPS.
            * config/aarch64/aarch64.opt (aarch64-mops-memset-size-threshold):
            New param.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/mops_3.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.c              | 93 ++++++++++++++++++++++++-------
 gcc/config/aarch64/aarch64.h              |  4 +-
 gcc/config/aarch64/aarch64.md             | 20 +++++--
 gcc/config/aarch64/aarch64.opt            |  4 ++
 gcc/testsuite/gcc.target/aarch64/mops_3.c | 85 ++++++++++++++++++++++++++++
 5 files changed, 181 insertions(+), 25 deletions(-)
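
(Editor's aside, not part of the commit.)  A hypothetical example of the
behaviour the new param controls.  Assuming a file clear_buf.c with a
constant-size memset at or above the threshold, compiling with the same
flags the new test below uses should produce the MOPS sequence instead of
a libcall; the file and function names here are illustrative only:

    /* clear_buf.c -- illustrative only.  */
    #include <string.h>

    void
    clear_buf (char *p)
    {
      /* Constant size 1024 >= aarch64-mops-memset-size-threshold (256),
         so this is expected to expand to SETP/SETM/SETE.  */
      memset (p, 0, 1024);
    }

    /* Compile with, e.g.:
       gcc -O2 -march=armv8.6-a+mops -c clear_buf.c  */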

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index bd754e4e2c2..d11a40c02cd 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23754,6 +23754,28 @@ aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
   *dst = aarch64_progress_pointer (*dst);
 }
 
+/* Expand a setmem using the MOPS instructions.  OPERANDS are the same
+   as for the setmem pattern.  Return true iff we succeed.  */
+static bool
+aarch64_expand_setmem_mops (rtx *operands)
+{
+  if (!TARGET_MOPS)
+    return false;
+
+  rtx addr_dst = XEXP (operands[0], 0);
+  rtx sz_reg = operands[1];
+  rtx val = operands[2];
+
+  if (!REG_P (sz_reg))
+   sz_reg = force_reg (DImode, sz_reg);
+  if (!REG_P (addr_dst))
+   addr_dst = force_reg (DImode, addr_dst);
+  if (!REG_P (val) && val != CONST0_RTX (QImode))
+   val = force_reg (QImode, val);
+  emit_insn (gen_aarch64_setmemdi (addr_dst, val, sz_reg));
+  return true;
+}
+
 /* Expand setmem, as if from a __builtin_memset.  Return true if
    we succeed, otherwise return false.  */
 
@@ -23767,39 +23789,59 @@ aarch64_expand_setmem (rtx *operands)
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* We can't do anything smart if the amount to copy is not constant.  */
-  if (!CONST_INT_P (operands[1]))
-    return false;
+  /* If we don't have SIMD registers or the size is variable, use the MOPS
+     inline sequence if possible.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+    return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Default the maximum to 256-bytes.  */
+  /* Default the maximum to 256-bytes when considering only libcall vs
+     SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
 
   len = INTVAL (operands[1]);
-
-  /* Upper bound check.  */
-  if (len > max_set_size)
+  if (len > max_set_size && !TARGET_MOPS)
     return false;
 
+  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
+  /* The MOPS sequence takes:
+     3 instructions for the memory storing
+     + 1 to move the constant size into a reg
+     + 1 if VAL is a non-zero constant to move into a reg
+    (zero constants can use XZR directly).  */
+  unsigned mops_cost = 3 + 1 + cst_val;
+  /* A libcall to memset in the worst case takes 3 instructions to prepare
+     the arguments + 1 for the call.  */
+  unsigned libcall_cost = 4;
+
+  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
+     when available.  */
+  if (TARGET_MOPS
+      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
+    return aarch64_expand_setmem_mops (operands);
+
   /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it for
-     code size.  */
+     Count the number of operations involved to see if it's worth it
+     against the alternatives.  A simple counter simd_ops on the
+     algorithmically-relevant operations is used rather than an rtx_insn count
+     as all the pointer adjustments and mode reinterprets will be optimized
+     away later.  */
   start_sequence ();
-  unsigned nops = 0;
+  unsigned simd_ops = 0;
+
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
-  nops++;
+  simd_ops++;
   /* Convert len to bits to make the rest of the code simpler.  */
   n = len * BITS_PER_UNIT;
 
   /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  setmem expand
-     pattern is only turned on for TARGET_SIMD.  */
+     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
   const int copy_limit = (aarch64_tune_params.extra_tuning_flags
 			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
 			  ? GET_MODE_BITSIZE (TImode) : 256;
@@ -23817,7 +23859,7 @@ aarch64_expand_setmem (rtx *operands)
 
       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
       aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      nops++;
+      simd_ops++;
       n -= mode_bits;
 
       /* Do certain trailing copies as overlapping if it's going to be
@@ -23835,12 +23877,25 @@ aarch64_expand_setmem (rtx *operands)
     }
   rtx_insn *seq = get_insns ();
   end_sequence ();
-  /* A call to memset in the worst case requires 3 instructions to prepare
-     the arguments + 1 for the call.  Prefer the inline sequence for size if
-     it is no longer than that.  */
-  if (size_p && nops > 4)
-    return false;
 
+  if (size_p)
+    {
+      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
+	 a call to memset, or the MOPS expansion.  */
+      if (TARGET_MOPS
+	  && mops_cost <= libcall_cost
+	  && mops_cost <= simd_ops)
+	return aarch64_expand_setmem_mops (operands);
+      /* If MOPS is not available or not shorter pick a libcall if the SIMD
+	 sequence is too long.  */
+      else if (libcall_cost < simd_ops)
+	return false;
+      emit_insn (seq);
+      return true;
+    }
+
+  /* At this point the SIMD broadcast sequence is the best choice when
+     optimizing for speed.  */
   emit_insn (seq);
   return true;
 }
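
(Editor's aside.)  The size-optimization branch above weighs three
alternatives against each other.  A minimal standalone sketch of that
comparison, with hypothetical names and simplified types (not GCC code):

    #include <stdbool.h>

    enum setmem_choice { USE_MOPS, USE_LIBCALL, USE_SIMD };

    /* Mirrors the costs in aarch64_expand_setmem: the MOPS sequence is
       SETP/SETM/SETE (3 insns) + 1 move for the size + 1 move if the
       value is a non-zero constant; a memset libcall is at worst 3
       argument-setup insns + 1 call.  */
    static enum setmem_choice
    choose_setmem_for_size (bool have_mops, bool val_is_nonzero_const,
                            unsigned simd_ops)
    {
      unsigned mops_cost = 3 + 1 + (val_is_nonzero_const ? 1 : 0);
      unsigned libcall_cost = 4;

      if (have_mops && mops_cost <= libcall_cost && mops_cost <= simd_ops)
        return USE_MOPS;
      if (libcall_cost < simd_ops)
        return USE_LIBCALL;
      return USE_SIMD;  /* emit the vector broadcast + stores sequence */
    }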
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 79d0bcd357f..2478d0db290 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1063,14 +1063,14 @@ typedef struct
    Otherwise follow a sensible default: when optimizing for size, give a better
    estimate of the length of a memset call, but use the default otherwise.  */
 #define CLEAR_RATIO(speed) \
-  (!STRICT_ALIGNMENT ? 4 : (speed) ? 15 : AARCH64_CALL_RATIO)
+  (!STRICT_ALIGNMENT ? (TARGET_MOPS ? 0 : 4) : (speed) ? 15 : AARCH64_CALL_RATIO)
 
 /* SET_RATIO is similar to CLEAR_RATIO, but for a non-zero constant.  Without
    -mstrict-align, make decisions in "setmem".  Otherwise follow a sensible
    default: when optimizing for size adjust the ratio to account for the
    overhead of loading the constant.  */
 #define SET_RATIO(speed) \
-  (!STRICT_ALIGNMENT ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
+  ((!STRICT_ALIGNMENT || TARGET_MOPS) ? 0 : (speed) ? 15 : AARCH64_CALL_RATIO - 2)
 
 /* Disable auto-increment in move_by_pieces et al.  Use of auto-increment is
    rarely a good idea in straight-line code since it adds an extra address
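
(Editor's aside.)  These ratios bound the number of scalar stores the
middle end may emit when clearing or setting memory "by pieces" before it
falls back to the setmem pattern or a libcall.  A deliberately simplified
sketch of that gate (not the actual middle-end code): with TARGET_MOPS
the ratio becomes 0, so a by-pieces store sequence is never chosen and
the block operation reaches the setmemdi expander instead.

    #include <stdbool.h>

    /* Simplified: the real check lives in GCC's by-pieces machinery.  */
    static bool
    use_store_by_pieces (unsigned pieces_needed, unsigned ratio)
    {
      /* ratio == 0 (the TARGET_MOPS case) rejects every non-empty
         sequence of piecewise stores.  */
      return pieces_needed <= ratio;
    }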
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b71c171ca04..9e50a26e6f4 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -204,6 +204,7 @@
     UNSPEC_SABDL2
     UNSPEC_SADALP
     UNSPEC_SCVTF
+    UNSPEC_SETMEM
     UNSPEC_SISD_NEG
     UNSPEC_SISD_SSHL
     UNSPEC_SISD_USHL
@@ -1650,18 +1651,29 @@
 }
 )
 
+(define_insn "aarch64_setmemdi"
+  [(parallel [
+   (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
+   (clobber (match_operand:DI 0 "register_operand" "+&r"))
+   (set (mem:BLK (match_dup 0))
+        (unspec:BLK [(match_operand:QI 1 "aarch64_reg_or_zero" "rZ")
+                    (match_dup 2)] UNSPEC_SETMEM))])]
+ "TARGET_MOPS"
+ "setp\t[%x0]!, %x2!, %x1\;setm\t[%x0]!, %x2!, %x1\;sete\t[%x0]!, %x2!, %x1"
+ [(set_attr "length" "12")]
+)
+
 ;; 0 is dst
 ;; 1 is val
 ;; 2 is size of copy in bytes
 ;; 3 is alignment
-
 (define_expand "setmemdi"
   [(set (match_operand:BLK 0 "memory_operand")     ;; Dest
         (match_operand:QI  2 "nonmemory_operand")) ;; Value
-   (use (match_operand:DI  1 "immediate_operand")) ;; Length
+   (use (match_operand:DI  1 "general_operand")) ;; Length
    (match_operand          3 "immediate_operand")] ;; Align
-  "TARGET_SIMD"
-{
+ "TARGET_SIMD || TARGET_MOPS"
+ {
   if (aarch64_expand_setmem (operands))
     DONE;
 
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 33788ffd9e3..264739e6143 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -288,3 +288,7 @@ Constant memcpy size in bytes above which to start using MOPS sequence.
 -param=aarch64-mops-memmove-size-threshold=
 Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
 Constant memmove size in bytes above which to start using MOPS sequence.
+
+-param=aarch64-mops-memset-size-threshold=
+Target Joined UInteger Var(aarch64_mops_memset_size_threshold) Init(256) Param
+Constant memset size in bytes from which to start using MOPS sequence.
diff --git a/gcc/testsuite/gcc.target/aarch64/mops_3.c b/gcc/testsuite/gcc.target/aarch64/mops_3.c
new file mode 100644
index 00000000000..0eda2ffb578
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mops_3.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.6-a+mops --param=aarch64-mops-memset-size-threshold=0" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdlib.h>
+
+/* We want to inline variable-sized memset.
+** do_it_set:
+**	setp	\[x0\]\!, x2\!, x1
+**	setm	\[x0\]\!, x2\!, x1
+**	sete	\[x0\]\!, x2\!, x1
+**	ret
+*/
+void do_it_set (char * out, int n, size_t size)
+{
+  __builtin_memset (out, n, size);
+}
+
+/*
+** do_it_set_large:
+**	mov	w2, 1
+**	mov	x1, 1024
+**	setp	\[x0\]\!, x1\!, x2
+**	setm	\[x0\]\!, x1\!, x2
+**	sete	\[x0\]\!, x1\!, x2
+**	ret
+*/
+void do_it_set_large (char * out)
+{
+  __builtin_memset (out, 1, 1024);
+}
+
+/*
+** do_it_set_256:
+**	mov	w2, 1
+**	mov	x1, 256
+**	setp	\[x0\]\!, x1\!, x2
+**	setm	\[x0\]\!, x1\!, x2
+**	sete	\[x0\]\!, x1\!, x2
+**	ret
+*/
+void do_it_set_256 (char * out)
+{
+  __builtin_memset (out, 1, 256);
+}
+
+/*
+** do_it_set_255:
+**	mov	w2, 1
+**	mov	x1, 255
+**	setp	\[x0\]\!, x1\!, x2
+**	setm	\[x0\]\!, x1\!, x2
+**	sete	\[x0\]\!, x1\!, x2
+**	ret
+*/
+void do_it_set_255 (char * out)
+{
+  __builtin_memset (out, 1, 255);
+}
+
+/*
+** do_it_set_0:
+**	setp	\[x0\]\!, x1\!, xzr
+**	setm	\[x0\]\!, x1\!, xzr
+**	sete	\[x0\]\!, x1\!, xzr
+**	ret
+*/
+void do_it_set_0 (char * out, size_t n)
+{
+  __builtin_memset (out, 0, n);
+}
+
+/*
+** do_it_set_0_255:
+**	mov	x1, 255
+**	setp	\[x0\]\!, x1\!, xzr
+**	setm	\[x0\]\!, x1\!, xzr
+**	sete	\[x0\]\!, x1\!, xzr
+**	ret
+*/
+void do_it_set_0_255 (char * out)
+{
+  __builtin_memset (out, 0, 255);
+}
+
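
(Editor's aside.)  A typical way to run just this new test from a GCC
build tree is:

    make check-gcc RUNTESTFLAGS="aarch64.exp=mops_3.c"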

