public inbox for libc-stable@sourceware.org
* [2.31 COMMITTED] AArch64: Backport memcpy improvements
@ 2020-10-14 15:36 Wilco Dijkstra
From: Wilco Dijkstra @ 2020-10-14 15:36 UTC (permalink / raw)
  To: libc-stable; +Cc: nd

commit 4bc9918c998085800ecf5bbb3c863e66ea6252a0
Author: Wilco Dijkstra <wdijkstr@arm.com>
Date:   Wed Oct 14 13:56:21 2020 +0100

    AArch64: Use __memcpy_simd on Neoverse N2/V1

    Add CPU detection of Neoverse N2 and Neoverse V1, and select __memcpy_simd as
    the memcpy/memmove ifunc.

    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
    (cherry picked from commit e11ed9d2b4558eeacff81557dc9557001af42a6b)
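
For reference, a minimal C sketch of how the MIDR-based selection works is
below.  It is an illustration only (select_memcpy and the *_impl pointers are
made-up names); the real macros and ifunc resolver are in the cpu-features.h
and multiarch/memcpy.c hunks further down.  The field layout (implementer in
bits 31:24, part number in bits 15:4) follows the Arm MIDR_EL1 definition.

#include <stddef.h>
#include <stdint.h>

/* Simplified MIDR field accessors.  */
#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xff)
#define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)

#define IS_NEOVERSE_N1(midr) (MIDR_IMPLEMENTOR (midr) == 'A' \
                              && MIDR_PARTNUM (midr) == 0xd0c)
#define IS_NEOVERSE_N2(midr) (MIDR_IMPLEMENTOR (midr) == 'A' \
                              && MIDR_PARTNUM (midr) == 0xd49)
#define IS_NEOVERSE_V1(midr) (MIDR_IMPLEMENTOR (midr) == 'A' \
                              && MIDR_PARTNUM (midr) == 0xd40)

/* Hypothetical resolver mirroring the shape of the libc_ifunc call.  */
typedef void *(*memcpy_fn) (void *, const void *, size_t);
extern memcpy_fn memcpy_simd_impl, memcpy_generic_impl;  /* placeholders */

static memcpy_fn
select_memcpy (uint64_t midr)
{
  if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr) || IS_NEOVERSE_V1 (midr))
    return memcpy_simd_impl;
  return memcpy_generic_impl;
}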

commit 4722d1fb9d605e084dd0d9030a1643386c26cfdf
Author: Wilco Dijkstra <wdijkstr@arm.com>
Date:   Wed Mar 11 17:15:25 2020 +0000

    [AArch64] Improve integer memcpy

    Further optimize integer memcpy.  Small cases now include copies up
    to 32 bytes.  64-128 byte copies are split into two cases to improve
    performance of 64-96 byte copies.  Comments have been rewritten.

    (cherry picked from commit 700065132744e0dfa6d4d9142d63f6e3a1934726)
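
The small-copy case relies on copying from both ends of the buffer, so a
single code path covers a whole size range without a byte loop.  A minimal C
sketch of the idea (illustration only, not the glibc code; copy_16_to_32 is a
made-up name):

#include <stdint.h>
#include <string.h>

/* Copy n bytes for 16 <= n <= 32: load 16 bytes from the start and 16
   bytes from the end before storing anything.  The two stores overlap
   when n < 32, which is harmless because all loads happen before any
   store - the same property is what lets memmove share this path.  */
static void
copy_16_to_32 (void *dst, const void *src, size_t n)
{
  uint64_t head[2], tail[2];
  memcpy (head, src, 16);
  memcpy (tail, (const char *) src + n - 16, 16);
  memcpy (dst, head, 16);
  memcpy ((char *) dst + n - 16, tail, 16);
}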

commit bea507a3f55604064e2305fdf71e80ee4f43c2d3
Author: Wilco Dijkstra <wdijkstr@arm.com>
Date:   Wed Jul 15 16:58:07 2020 +0100

    AArch64: Rename IS_ARES to IS_NEOVERSE_N1

    Rename IS_ARES to IS_NEOVERSE_N1 since that is a bit clearer.

    Reviewed-by: Carlos O'Donell <carlos@redhat.com>
    (cherry picked from commit 0f6278a8793a5d04ea31878119eccf99f469a02d)

commit d0a5b769027b17a7000ebc58e240ddd98ae0d719
Author: Wilco Dijkstra <wdijkstr@arm.com>
Date:   Fri Aug 28 17:51:40 2020 +0100

    AArch64: Improve backwards memmove performance

    On some microarchitectures performance of the backwards memmove improves if
    the stores use STR with decreasing addresses.  So change the memmove loop
    in memcpy_advsimd.S to use 2x STR rather than STP.

    Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
    (cherry picked from commit bd394d131c10c9ec22c6424197b79410042eed99)
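
The STR-versus-STP change is purely an instruction-selection detail with no C
equivalent, but the backwards loop it applies to can be modelled roughly as
below (a sketch assuming dst > src with an overlapping move; block size and
alignment handling are simplified):

#include <stddef.h>
#include <string.h>

/* Overlapping move with dst above src: walk the buffer from the end so
   source bytes are consumed before the destination overwrites them.
   Each 64-byte block is fully loaded before it is stored, mirroring the
   load-then-store order of the assembly loop.  */
static void
move_backwards (unsigned char *dst, const unsigned char *src, size_t n)
{
  while (n >= 64)
    {
      unsigned char block[64];
      n -= 64;
      memcpy (block, src + n, 64);   /* load the highest remaining block */
      memcpy (dst + n, block, 64);   /* then store it */
    }
  while (n-- > 0)                    /* leftover head, highest byte first */
    dst[n] = src[n];
}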

commit 24a30c595958a1b23b620bc3dea62b0ab9d8f480
Author: Wilco Dijkstra <wdijkstr@arm.com>
Date:   Wed Jul 15 16:55:07 2020 +0100

    AArch64: Add optimized Q-register memcpy

    Add a new memcpy using 128-bit Q registers - this is faster on modern
    cores and reduces codesize.  Similar to the generic memcpy, small cases
    include copies up to 32 bytes.  64-128 byte copies are split into two
    cases to improve performance of 64-96 byte copies.  Large copies align
    the source rather than the destination.

    bench-memcpy-random is ~9% faster than memcpy_falkor on Neoverse N1,
    so make this memcpy the default on N1 (on Centriq it is 15% faster than
    memcpy_falkor).

    Passes GLIBC regression tests.

    Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
    (cherry picked from commit 4a733bf375238a6a595033b5785cea7f27d61307)
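
As in the generic version, the large-copy tail is handled by always copying
the final 64 bytes from the end rather than running a remainder loop (see
L(copy64_from_end) in the diff below).  A rough C model of that idea (not the
assembly; alignment and software pipelining omitted, copy_large is a made-up
name):

#include <string.h>

/* Large non-overlapping copy, n > 128: run the 64-byte loop until fewer
   than 64 bytes remain past dst, then finish with one unconditional
   64-byte copy that ends exactly at the last byte.  The tail may re-copy
   bytes the loop already wrote, which is harmless for memcpy.  */
static void
copy_large (char *dst, const char *src, size_t n)
{
  const char *srcend = src + n;
  char *dstend = dst + n;

  while (n > 64)
    {
      memcpy (dst, src, 64);
      dst += 64;
      src += 64;
      n -= 64;
    }
  memcpy (dstend - 64, srcend - 64, 64);
}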

commit 88db98fa6e427c05d202af8b3d0f1402df20c44d
Author: Wilco Dijkstra <wdijkstr@arm.com>
Date:   Wed Jul 15 16:50:02 2020 +0100

    AArch64: Align ENTRY to a cacheline

    Given almost all uses of ENTRY are for string/memory functions,
    align ENTRY to a cacheline to simplify things.

    Reviewed-by: Carlos O'Donell <carlos@redhat.com>
    (cherry picked from commit 34f0d01d5e43c7dedd002ab47f6266dfb5b79c22)


diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index d0d47e9..e0b4c45 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -33,11 +33,11 @@
 #define A_l    x6
 #define A_lw   w6
 #define A_h    x7
-#define A_hw   w7
 #define B_l    x8
 #define B_lw   w8
 #define B_h    x9
 #define C_l    x10
+#define C_lw   w10
 #define C_h    x11
 #define D_l    x12
 #define D_h    x13
@@ -51,16 +51,6 @@
 #define H_h    srcend
 #define tmp1   x14

-/* Copies are split into 3 main cases: small copies of up to 32 bytes,
-   medium copies of 33..128 bytes which are fully unrolled. Large copies
-   of more than 128 bytes align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-   In order to share code with memmove, small and medium copies read all
-   data before writing, allowing any kind of overlap. So small, medium
-   and large backwards memmoves are handled by falling through into memcpy.
-   Overlapping large forward memmoves use a loop that copies backwards.
-*/
-
 #ifndef MEMMOVE
 # define MEMMOVE memmove
 #endif
@@ -68,118 +58,115 @@
 # define MEMCPY memcpy
 #endif

-ENTRY_ALIGN (MEMMOVE, 6)
+/* This implementation supports both memcpy and memmove and shares most code.
+   It uses unaligned accesses and branchless sequences to keep the code small,
+   simple and improve performance.

-       DELOUSE (0)
-       DELOUSE (1)
-       DELOUSE (2)
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check in memmove is negligible since it is only required for large copies.

-       sub     tmp1, dstin, src
-       cmp     count, 128
-       ccmp    tmp1, count, 2, hi
-       b.lo    L(move_long)
-
-       /* Common case falls through into memcpy.  */
-END (MEMMOVE)
-libc_hidden_builtin_def (MEMMOVE)
-ENTRY (MEMCPY)
+   Large copies use a software pipelined loop processing 64 bytes per
+   iteration.  The destination pointer is 16-byte aligned to minimize
+   unaligned accesses.  The loop tail is handled by always copying 64 bytes
+   from the end.
+*/

+ENTRY_ALIGN (MEMCPY, 6)
        DELOUSE (0)
        DELOUSE (1)
        DELOUSE (2)

-       prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
-       cmp     count, 32
-       b.ls    L(copy32)
        cmp     count, 128
        b.hi    L(copy_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)

-       /* Medium copies: 33..128 bytes.  */
+       /* Small copies: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
        ldp     A_l, A_h, [src]
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
        ldp     D_l, D_h, [srcend, -16]
-       cmp     count, 64
-       b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

-       .p2align 4
-       /* Small copies: 0..32 bytes.  */
-L(copy32):
-       /* 16-32 bytes.  */
-       cmp     count, 16
-       b.lo    1f
-       ldp     A_l, A_h, [src]
-       ldp     B_l, B_h, [srcend, -16]
-       stp     A_l, A_h, [dstin]
-       stp     B_l, B_h, [dstend, -16]
-       ret
-       .p2align 4
-1:
-       /* 8-15 bytes.  */
-       tbz     count, 3, 1f
+       /* Copy 8-15 bytes.  */
+L(copy16):
+       tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
-       .p2align 4
-1:
-       /* 4-7 bytes.  */
-       tbz     count, 2, 1f
+
+       .p2align 3
+       /* Copy 4-7 bytes.  */
+L(copy8):
+       tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
-       ldr     A_hw, [srcend, -4]
+       ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
-       str     A_hw, [dstend, -4]
+       str     B_lw, [dstend, -4]
        ret

-       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
-       cbz     count, 2f
+       /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+       cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
-       ldrb    A_hw, [srcend, -1]
+       ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
-       strb    A_hw, [dstend, -1]
-2:     ret
+       strb    C_lw, [dstend, -1]
+L(copy0):
+       ret
+
+       .p2align 4
+       /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+       ldp     A_l, A_h, [src]
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       ldp     D_l, D_h, [srcend, -16]
+       cmp     count, 64
+       b.hi    L(copy128)
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
+       ret

        .p2align 4
-       /* Copy 65..128 bytes.  Copy 64 bytes from the start and
-          64 bytes from the end.  */
+       /* Copy 65..128 bytes.  */
 L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
+       cmp     count, 96
+       b.ls    L(copy96)
        ldp     G_l, G_h, [srcend, -64]
        ldp     H_l, H_h, [srcend, -48]
+       stp     G_l, G_h, [dstend, -64]
+       stp     H_l, H_h, [dstend, -48]
+L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
-       stp     G_l, G_h, [dstend, -64]
-       stp     H_l, H_h, [dstend, -48]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

-       /* Align DST to 16 byte alignment so that we don't cross cache line
-          boundaries on both loads and stores.  There are at least 128 bytes
-          to copy, so copy 16 bytes unaligned and then align.  The loop
-          copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
        .p2align 4
+       /* Copy more than 128 bytes.  */
 L(copy_long):
+       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+       ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
-       ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
@@ -188,7 +175,8 @@ L(copy_long):
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
-       b.ls    L(last64)
+       b.ls    L(copy64_from_end)
+
 L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
@@ -201,10 +189,8 @@ L(loop64):
        subs    count, count, 64
        b.hi    L(loop64)

-       /* Write the last full set of 64 bytes.  The remainder is at most 64
-          bytes, so it is safe to always copy 64 bytes from the end even if
-          there is just 1 byte left.  */
-L(last64):
+       /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
@@ -219,20 +205,42 @@ L(last64):
        stp     C_l, C_h, [dstend, -16]
        ret

-       .p2align 4
-L(move_long):
-       cbz     tmp1, 3f
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+ENTRY_ALIGN (MEMMOVE, 4)
+       DELOUSE (0)
+       DELOUSE (1)
+       DELOUSE (2)

        add     srcend, src, count
        add     dstend, dstin, count
+       cmp     count, 128
+       b.hi    L(move_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)

-       /* Align dstend to 16 byte alignment so that we don't cross cache line
-          boundaries on both loads and stores.  There are at least 128 bytes
-          to copy, so copy 16 bytes unaligned and then align.  The loop
-          copies 64 bytes per iteration and prefetches one iteration ahead.  */
+       /* Small copies: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
+       ldp     A_l, A_h, [src]
+       ldp     D_l, D_h, [srcend, -16]
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret

-       and     tmp1, dstend, 15
+       .p2align 4
+L(move_long):
+       /* Only use backward copy if there is an overlap.  */
+       sub     tmp1, dstin, src
+       cbz     tmp1, L(copy0)
+       cmp     tmp1, count
+       b.hs    L(copy_long)
+
+       /* Large backwards copy for overlapping copies.
+          Copy 16 bytes and then align dst to 16-byte alignment.  */
        ldp     D_l, D_h, [srcend, -16]
+       and     tmp1, dstend, 15
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
@@ -242,10 +250,9 @@ L(move_long):
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
-       b.ls    2f
+       b.ls    L(copy64_from_start)

-       nop
-1:
+L(loop64_backwards):
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
@@ -255,12 +262,10 @@ L(move_long):
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
-       b.hi    1b
+       b.hi    L(loop64_backwards)

-       /* Write the last full set of 64 bytes.  The remainder is at most 64
-          bytes, so it is safe to always copy 64 bytes from the start even if
-          there is just 1 byte left.  */
-2:
+       /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
@@ -273,7 +278,7 @@ L(move_long):
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
-3:     ret
+       ret

-END (MEMCPY)
-libc_hidden_builtin_def (MEMCPY)
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 8378107..3c5292d 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,5 +1,5 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
                   memcpy_falkor memmove_falkor \
                   memset_generic memset_falkor memset_emag memset_kunpeng \
                   memchr_generic memchr_nosimd \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index b7da62c..4b004ac 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -42,11 +42,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx2)
              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+             IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
              /* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 2fafefd..799d60c 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,6 +29,7 @@
 extern __typeof (__redirect_memcpy) __libc_memcpy;

 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
@@ -36,11 +37,14 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
 libc_ifunc (__libc_memcpy,
             (IS_THUNDERX (midr)
             ? __memcpy_thunderx
-            : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_ARES (midr) || IS_KUNPENG920 (midr)
+            : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_KUNPENG920 (midr)
                ? __memcpy_falkor
                : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
                  ? __memcpy_thunderx2
-                 : __memcpy_generic))));
+                 : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+                    || IS_NEOVERSE_V1 (midr)
+                    ? __memcpy_simd
+                    : __memcpy_generic)))));

 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
new file mode 100644
index 0000000..48bb6d7
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
@@ -0,0 +1,248 @@
+/* Generic optimized memcpy using SIMD.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#define dstin  x0
+#define src    x1
+#define count  x2
+#define dst    x3
+#define srcend x4
+#define dstend x5
+#define A_l    x6
+#define A_lw   w6
+#define A_h    x7
+#define B_l    x8
+#define B_lw   w8
+#define B_h    x9
+#define C_lw   w10
+#define tmp1   x14
+
+#define A_q    q0
+#define B_q    q1
+#define C_q    q2
+#define D_q    q3
+#define E_q    q4
+#define F_q    q5
+#define G_q    q6
+#define H_q    q7
+
+
+/* This implementation supports both memcpy and memmove and shares most code.
+   It uses unaligned accesses and branchless sequences to keep the code small,
+   simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check in memmove is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per
+   iteration.  The destination pointer is 16-byte aligned to minimize
+   unaligned accesses.  The loop tail is handled by always copying 64 bytes
+   from the end.  */
+
+ENTRY (__memcpy_simd)
+       DELOUSE (0)
+       DELOUSE (1)
+       DELOUSE (2)
+
+       add     srcend, src, count
+       add     dstend, dstin, count
+       cmp     count, 128
+       b.hi    L(copy_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)
+
+       /* Small copies: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
+       ldr     A_q, [src]
+       ldr     B_q, [srcend, -16]
+       str     A_q, [dstin]
+       str     B_q, [dstend, -16]
+       ret
+
+       /* Copy 8-15 bytes.  */
+L(copy16):
+       tbz     count, 3, L(copy8)
+       ldr     A_l, [src]
+       ldr     A_h, [srcend, -8]
+       str     A_l, [dstin]
+       str     A_h, [dstend, -8]
+       ret
+
+       /* Copy 4-7 bytes.  */
+L(copy8):
+       tbz     count, 2, L(copy4)
+       ldr     A_lw, [src]
+       ldr     B_lw, [srcend, -4]
+       str     A_lw, [dstin]
+       str     B_lw, [dstend, -4]
+       ret
+
+       /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+       cbz     count, L(copy0)
+       lsr     tmp1, count, 1
+       ldrb    A_lw, [src]
+       ldrb    C_lw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    C_lw, [dstend, -1]
+L(copy0):
+       ret
+
+       .p2align 4
+       /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+       ldp     A_q, B_q, [src]
+       ldp     C_q, D_q, [srcend, -32]
+       cmp     count, 64
+       b.hi    L(copy128)
+       stp     A_q, B_q, [dstin]
+       stp     C_q, D_q, [dstend, -32]
+       ret
+
+       .p2align 4
+       /* Copy 65..128 bytes.  */
+L(copy128):
+       ldp     E_q, F_q, [src, 32]
+       cmp     count, 96
+       b.ls    L(copy96)
+       ldp     G_q, H_q, [srcend, -64]
+       stp     G_q, H_q, [dstend, -64]
+L(copy96):
+       stp     A_q, B_q, [dstin]
+       stp     E_q, F_q, [dstin, 32]
+       stp     C_q, D_q, [dstend, -32]
+       ret
+
+       /* Align loop64 below to 16 bytes.  */
+       nop
+
+       /* Copy more than 128 bytes.  */
+L(copy_long):
+       /* Copy 16 bytes and then align src to 16-byte alignment.  */
+       ldr     D_q, [src]
+       and     tmp1, src, 15
+       bic     src, src, 15
+       sub     dst, dstin, tmp1
+       add     count, count, tmp1      /* Count is now 16 too large.  */
+       ldp     A_q, B_q, [src, 16]
+       str     D_q, [dstin]
+       ldp     C_q, D_q, [src, 48]
+       subs    count, count, 128 + 16  /* Test and readjust count.  */
+       b.ls    L(copy64_from_end)
+L(loop64):
+       stp     A_q, B_q, [dst, 16]
+       ldp     A_q, B_q, [src, 80]
+       stp     C_q, D_q, [dst, 48]
+       ldp     C_q, D_q, [src, 112]
+       add     src, src, 64
+       add     dst, dst, 64
+       subs    count, count, 64
+       b.hi    L(loop64)
+
+       /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+       ldp     E_q, F_q, [srcend, -64]
+       stp     A_q, B_q, [dst, 16]
+       ldp     A_q, B_q, [srcend, -32]
+       stp     C_q, D_q, [dst, 48]
+       stp     E_q, F_q, [dstend, -64]
+       stp     A_q, B_q, [dstend, -32]
+       ret
+
+END (__memcpy_simd)
+libc_hidden_builtin_def (__memcpy_simd)
+
+
+ENTRY (__memmove_simd)
+       DELOUSE (0)
+       DELOUSE (1)
+       DELOUSE (2)
+
+       add     srcend, src, count
+       add     dstend, dstin, count
+       cmp     count, 128
+       b.hi    L(move_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)
+
+       /* Small moves: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
+       ldr     A_q, [src]
+       ldr     B_q, [srcend, -16]
+       str     A_q, [dstin]
+       str     B_q, [dstend, -16]
+       ret
+
+L(move_long):
+       /* Only use backward copy if there is an overlap.  */
+       sub     tmp1, dstin, src
+       cbz     tmp1, L(move0)
+       cmp     tmp1, count
+       b.hs    L(copy_long)
+
+       /* Large backwards copy for overlapping copies.
+          Copy 16 bytes and then align srcend to 16-byte alignment.  */
+L(copy_long_backwards):
+       ldr     D_q, [srcend, -16]
+       and     tmp1, srcend, 15
+       bic     srcend, srcend, 15
+       sub     count, count, tmp1
+       ldp     A_q, B_q, [srcend, -32]
+       str     D_q, [dstend, -16]
+       ldp     C_q, D_q, [srcend, -64]
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+       str     B_q, [dstend, -16]
+       str     A_q, [dstend, -32]
+       ldp     A_q, B_q, [srcend, -96]
+       str     D_q, [dstend, -48]
+       str     C_q, [dstend, -64]!
+       ldp     C_q, D_q, [srcend, -128]
+       sub     srcend, srcend, 64
+       subs    count, count, 64
+       b.hi    L(loop64_backwards)
+
+       /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+       ldp     E_q, F_q, [src, 32]
+       stp     A_q, B_q, [dstend, -32]
+       ldp     A_q, B_q, [src]
+       stp     C_q, D_q, [dstend, -64]
+       stp     E_q, F_q, [dstin, 32]
+       stp     A_q, B_q, [dstin]
+L(move0):
+       ret
+
+END (__memmove_simd)
+libc_hidden_builtin_def (__memmove_simd)
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index ed5a47f..46a4cb3 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,6 +29,7 @@
 extern __typeof (__redirect_memmove) __libc_memmove;

 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
@@ -40,7 +41,10 @@ libc_ifunc (__libc_memmove,
                ? __memmove_falkor
                : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
                  ? __memmove_thunderx2
-                 : __memmove_generic))));
+                 : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+                    || IS_NEOVERSE_V1 (midr)
+                    ? __memmove_simd
+                    : __memmove_generic)))));

 # undef memmove
 strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h
index 604c489..f1feb19 100644
--- a/sysdeps/aarch64/sysdep.h
+++ b/sysdeps/aarch64/sysdep.h
@@ -45,7 +45,7 @@
 #define ENTRY(name)                                            \
   .globl C_SYMBOL_NAME(name);                                  \
   .type C_SYMBOL_NAME(name),%function;                         \
-  .align 4;                                                    \
+  .p2align 6;                                                  \
   C_LABEL(name)                                                        \
   cfi_startproc;                                               \
   CALL_MCOUNT
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 1389cea..346d045 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -51,8 +51,12 @@

 #define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h'                       \
                         && MIDR_PARTNUM(midr) == 0x000)
-#define IS_ARES(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                         \
-                       && MIDR_PARTNUM(midr) == 0xd0c)
+#define IS_NEOVERSE_N1(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                  \
+                             && MIDR_PARTNUM(midr) == 0xd0c)
+#define IS_NEOVERSE_N2(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                  \
+                             && MIDR_PARTNUM(midr) == 0xd49)
+#define IS_NEOVERSE_V1(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                  \
+                             && MIDR_PARTNUM(midr) == 0xd40)

 #define IS_EMAG(midr) (MIDR_IMPLEMENTOR(midr) == 'P'                         \
                        && MIDR_PARTNUM(midr) == 0x000)


