public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-5932] aarch64: Add support for Armv8.8-a memory operations and memcpy expansion
@ 2021-12-13 15:18 Kyrylo Tkachov
  0 siblings, 0 replies; only message in thread
From: Kyrylo Tkachov @ 2021-12-13 15:18 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:0caf592d6ae836a99907841fccd31c4c5f180e8d

commit r12-5932-g0caf592d6ae836a99907841fccd31c4c5f180e8d
Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date:   Mon Dec 13 14:11:59 2021 +0000

    aarch64: Add support for Armv8.8-a memory operations and memcpy expansion
    
    This patch adds the +mops architecture extension flag from the 2021 Arm Architecture extensions, Armv8.8-a.
    The +mops extensions introduce instructions to accelerate the memcpy, memset, memmove standard functions.
    The first patch here uses the instructions in the inline memcpy expansion.
    Further patches in the series will use similar instructions to inline memmove and memset.
    
    A new param, aarch64-mops-memcpy-size-threshold, is introduced to control the size threshold above which to
    emit the new sequence. Its default setting is 256 bytes, which is the same as the current threshold above
    which we'd emit a libcall.
    
    Bootstrapped and tested on aarch64-none-linux-gnu.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-option-extensions.def (mops): Define.
            * config/aarch64/aarch64.c (aarch64_expand_cpymem_mops): Define.
            (aarch64_expand_cpymem): Define.
            * config/aarch64/aarch64.h (AARCH64_FL_MOPS): Define.
            (AARCH64_ISA_MOPS): Define.
            (TARGET_MOPS): Define.
            (MOVE_RATIO): Adjust for TARGET_MOPS.
            * config/aarch64/aarch64.md ("unspec"): Add UNSPEC_CPYMEM.
            (aarch64_cpymemdi): New pattern.
            (cpymemdi): Adjust for TARGET_MOPS.
            * config/aarch64/aarch64.opt (aarch64-mops-memcpy-size-threshold):
            New param.
            * doc/invoke.texi (AArch64 Options): Document +mops.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/mops_1.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-option-extensions.def |  3 ++
 gcc/config/aarch64/aarch64.c                     | 62 ++++++++++++++++++++----
 gcc/config/aarch64/aarch64.h                     | 20 +++++---
 gcc/config/aarch64/aarch64.md                    | 17 ++++++-
 gcc/config/aarch64/aarch64.opt                   |  4 ++
 gcc/doc/invoke.texi                              |  3 ++
 gcc/testsuite/gcc.target/aarch64/mops_1.c        | 57 ++++++++++++++++++++++
 7 files changed, 149 insertions(+), 17 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index b61f1df9019..3f449fba415 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -235,4 +235,7 @@ AARCH64_OPT_EXTENSION("pauth", AARCH64_FL_PAUTH, 0, 0, false, "paca pacg")
 /* Enabling/Disabling "ls64" only changes "ls64".  */
 AARCH64_OPT_EXTENSION("ls64", AARCH64_FL_LS64, 0, 0, false, "")
 
+/* Enabling/disabling "mops" only changes "mops".  */
+AARCH64_OPT_EXTENSION("mops", AARCH64_FL_MOPS, 0, 0, false, "")
+
 #undef AARCH64_OPT_EXTENSION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index be24b7320d2..bd754e4e2c2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23568,6 +23568,28 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
   *dst = aarch64_progress_pointer (*dst);
 }
 
+/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
+   from the cpymem pattern.  Return true iff we succeeded.  */
+static bool
+aarch64_expand_cpymem_mops (rtx *operands)
+{
+  if (!TARGET_MOPS)
+    return false;
+  rtx addr_dst = XEXP (operands[0], 0);
+  rtx addr_src = XEXP (operands[1], 0);
+  rtx sz_reg = operands[2];
+
+  if (!REG_P (sz_reg))
+    sz_reg = force_reg (DImode, sz_reg);
+  if (!REG_P (addr_dst))
+    addr_dst = force_reg (DImode, addr_dst);
+  if (!REG_P (addr_src))
+    addr_src = force_reg (DImode, addr_src);
+  emit_insn (gen_aarch64_cpymemdi (addr_dst, addr_src, sz_reg));
+
+  return true;
+}
+
 /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
    we succeed, otherwise return false, indicating that a libcall to
    memcpy should be emitted.  */
@@ -23581,19 +23603,25 @@ aarch64_expand_cpymem (rtx *operands)
   rtx base;
   machine_mode cur_mode = BLKmode;
 
-  /* Only expand fixed-size copies.  */
+  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
   if (!CONST_INT_P (operands[2]))
-    return false;
+    return aarch64_expand_cpymem_mops (operands);
 
   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Try to inline up to 256 bytes.  */
-  unsigned HOST_WIDE_INT max_copy_size = 256;
+  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
+  unsigned HOST_WIDE_INT max_copy_size
+    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
 
   bool size_p = optimize_function_for_size_p (cfun);
 
+  /* Large constant-sized cpymem should go through MOPS when possible.
+     It should be a win even for size optimization in the general case.
+     For speed optimization the choice between MOPS and the SIMD sequence
+     depends on the size of the copy, rather than number of instructions,
+     alignment etc.  */
   if (size > max_copy_size)
-    return false;
+    return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
 
@@ -23643,9 +23671,9 @@ aarch64_expand_cpymem (rtx *operands)
       nops += 2;
       n -= mode_bits;
 
-      /* Emit trailing copies using overlapping unaligned accesses - this is
-	 smaller and faster.  */
-      if (n > 0 && n < copy_bits / 2)
+      /* Emit trailing copies using overlapping unaligned accesses
+	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
 	{
 	  machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
@@ -23657,9 +23685,25 @@ aarch64_expand_cpymem (rtx *operands)
     }
   rtx_insn *seq = get_insns ();
   end_sequence ();
+  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
+     the constant size into a register.  */
+  unsigned mops_cost = 3 + 1;
+
+  /* If MOPS is available at this point we don't consider the libcall as it's
+     not a win even on code size.  At this point only consider MOPS if
+     optimizing for size.  For speed optimizations we will have chosen between
+     the two based on copy size already.  */
+  if (TARGET_MOPS)
+    {
+      if (size_p && mops_cost < nops)
+	return aarch64_expand_cpymem_mops (operands);
+      emit_insn (seq);
+      return true;
+    }
 
   /* A memcpy libcall in the worst case takes 3 instructions to prepare the
-     arguments + 1 for the call.  */
+     arguments + 1 for the call.  When MOPS is not available and we're
+     optimizing for size a libcall may be preferable.  */
   unsigned libcall_cost = 4;
   if (size_p && libcall_cost < nops)
     return false;
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2792bb29adb..79d0bcd357f 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -231,14 +231,17 @@ extern unsigned aarch64_architecture_version;
 /* Pointer Authentication (PAUTH) extension.  */
 #define AARCH64_FL_PAUTH      (1ULL << 40)
 
+/* Armv9.0-A.  */
+#define AARCH64_FL_V9         (1ULL << 41)  /* Armv9.0-A Architecture.  */
+
 /* 64-byte atomic load/store extensions.  */
-#define AARCH64_FL_LS64      (1ULL << 41)
+#define AARCH64_FL_LS64      (1ULL << 42)
 
 /* Armv8.7-a architecture extensions.  */
-#define AARCH64_FL_V8_7       (1ULL << 42)
+#define AARCH64_FL_V8_7       (1ULL << 43)
 
-/* Armv9.0-A.  */
-#define AARCH64_FL_V9         (1ULL << 43)  /* Armv9.0-A Architecture.  */
+/* Hardware memory operation instructions.  */
+#define AARCH64_FL_MOPS       (1ULL << 44)
 
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -310,6 +313,7 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_V8_R	   (aarch64_isa_flags & AARCH64_FL_V8_R)
 #define AARCH64_ISA_PAUTH	   (aarch64_isa_flags & AARCH64_FL_PAUTH)
 #define AARCH64_ISA_V9		   (aarch64_isa_flags & AARCH64_FL_V9)
+#define AARCH64_ISA_MOPS	   (aarch64_isa_flags & AARCH64_FL_MOPS)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
@@ -401,6 +405,9 @@ extern unsigned aarch64_architecture_version;
 /* PAUTH instructions are enabled through +pauth.  */
 #define TARGET_PAUTH (AARCH64_ISA_PAUTH)
 
+/* MOPS instructions are enabled through +mops.  */
+#define TARGET_MOPS (AARCH64_ISA_MOPS)
+
 /* Make sure this is always defined so we don't have to check for ifdefs
    but rather use normal ifs.  */
 #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
@@ -1046,9 +1053,10 @@ typedef struct
    7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
    for both size and speed of copy, so we will instead use the "cpymem"
    standard name to implement the copy.  This logic does not apply when
-   targeting -mstrict-align, so keep a sensible default in that case.  */
+   targeting -mstrict-align or TARGET_MOPS, so keep a sensible default in
+   that case.  */
 #define MOVE_RATIO(speed) \
-  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
+  ((!STRICT_ALIGNMENT || TARGET_MOPS) ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
 
 /* Like MOVE_RATIO, without -mstrict-align, make decisions in "setmem" when
    we would use more than 3 scalar instructions.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 5297b2d3f95..d623c1b00bf 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -143,6 +143,7 @@
     UNSPEC_AUTIBSP
     UNSPEC_CALLEE_ABI
     UNSPEC_CASESI
+    UNSPEC_CPYMEM
     UNSPEC_CRC32B
     UNSPEC_CRC32CB
     UNSPEC_CRC32CH
@@ -1572,6 +1573,18 @@
   }
 )
 
+(define_insn "aarch64_cpymemdi"
+  [(parallel [
+   (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
+   (clobber (match_operand:DI 0 "register_operand" "+&r"))
+   (clobber (match_operand:DI 1 "register_operand" "+&r"))
+   (set (mem:BLK (match_dup 0))
+        (unspec:BLK [(mem:BLK (match_dup 1)) (match_dup 2)] UNSPEC_CPYMEM))])]
+ "TARGET_MOPS"
+ "cpyfp\t[%x0]!, [%x1]!, %x2!\;cpyfm\t[%x0]!, [%x1]!, %x2!\;cpyfe\t[%x0]!, [%x1]!, %x2!"
+ [(set_attr "length" "12")]
+)
+
 ;; 0 is dst
 ;; 1 is src
 ;; 2 is size of copy in bytes
@@ -1580,9 +1593,9 @@
 (define_expand "cpymemdi"
   [(match_operand:BLK 0 "memory_operand")
    (match_operand:BLK 1 "memory_operand")
-   (match_operand:DI 2 "immediate_operand")
+   (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT"
+   "!STRICT_ALIGNMENT || TARGET_MOPS"
 {
   if (aarch64_expand_cpymem (operands))
     DONE;
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 32191cf1acf..7445ed106cc 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -280,3 +280,7 @@ Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4
 
 -param=aarch64-loop-vect-issue-rate-niters=
 Target Joined UInteger Var(aarch64_loop_vect_issue_rate_niters) Init(6) IntegerRange(0, 65536) Param
+
+-param=aarch64-mops-memcpy-size-threshold=
+Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
+Constant memcpy size in bytes above which to start using MOPS sequence.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 9b4371b9213..2424a5bf3e0 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -19144,6 +19144,9 @@ prior to Armv8.2-A is not supported.
 @item ls64
 Enable the 64-byte atomic load and store instructions for accelerators.
 This option is enabled by default for @option{-march=armv8.7-a}.
+@item mops
+Enable the instructions to accelerate memory operations like @code{memcpy},
+@code{memmove}, @code{memset}.
 @item flagm
 Enable the Flag Manipulation instructions Extension.
 @item pauth
diff --git a/gcc/testsuite/gcc.target/aarch64/mops_1.c b/gcc/testsuite/gcc.target/aarch64/mops_1.c
new file mode 100644
index 00000000000..661c14192e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/mops_1.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.6-a+mops --param=aarch64-mops-memcpy-size-threshold=0" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdlib.h>
+
+/* We want to inline variable-sized memcpy.
+** do_it_cpy:
+**	cpyfp	\[x1\]\!, \[x0\]\!, x2\!
+**	cpyfm	\[x1\]\!, \[x0\]\!, x2\!
+**	cpyfe	\[x1\]\!, \[x0\]\!, x2\!
+**	ret
+*/
+void do_it_cpy (char * in, char * out, size_t size)
+{
+  __builtin_memcpy (out, in, size);
+}
+
+/*
+** do_it_cpy_large:
+**	mov	x2, 1024
+**	cpyfp	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfm	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfe	\[x1\]\!, \[x0\]\!, x2\!
+**	ret
+*/
+void do_it_cpy_large (char * in, char * out)
+{
+  __builtin_memcpy (out, in, 1024);
+}
+
+/*
+** do_it_cpy_127:
+**	mov	x2, 127
+**	cpyfp	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfm	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfe	\[x1\]\!, \[x0\]\!, x2\!
+**	ret
+*/
+void do_it_cpy_127 (char * in, char * out)
+{
+  __builtin_memcpy (out, in, 127);
+}
+
+/*
+** do_it_cpy_128:
+**	mov	x2, 128
+**	cpyfp	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfm	\[x1\]\!, \[x0\]!, x2\!
+**	cpyfe	\[x1\]\!, \[x0\]\!, x2\!
+**	ret
+*/
+void do_it_cpy_128 (char * in, char * out)
+{
+  __builtin_memcpy (out, in, 128);
+}
+


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-12-13 15:18 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-13 15:18 [gcc r12-5932] aarch64: Add support for Armv8.8-a memory operations and memcpy expansion Kyrylo Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).