Hi,

Further optimize integer memcpy. Small cases now include copies up to 32
bytes. 64-128 byte copies are split into two cases to improve performance
of 64-96 byte copies. Comments have been rewritten.

The attached graph shows how the new memcpy (memcpy_new) performs against
the current generic memcpy and the previous version (memcpy.S before
commit b9f145df85). Passes GLIBC tests.

---

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index ff720c800ed0ca3afac03d19ba02f67817b3422e..e0547259a8618292fe798e70fe5b44409acecc51 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -33,11 +33,11 @@
 #define A_l	x6
 #define A_lw	w6
 #define A_h	x7
-#define A_hw	w7
 #define B_l	x8
 #define B_lw	w8
 #define B_h	x9
 #define C_l	x10
+#define C_lw	w10
 #define C_h	x11
 #define D_l	x12
 #define D_h	x13
@@ -51,16 +51,6 @@
 #define H_h	srcend
 #define tmp1	x14
 
-/* Copies are split into 3 main cases: small copies of up to 32 bytes,
-   medium copies of 33..128 bytes which are fully unrolled. Large copies
-   of more than 128 bytes align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-   In order to share code with memmove, small and medium copies read all
-   data before writing, allowing any kind of overlap. So small, medium
-   and large backwards memmoves are handled by falling through into memcpy.
-   Overlapping large forward memmoves use a loop that copies backwards.
-*/
-
 #ifndef MEMMOVE
 # define MEMMOVE memmove
 #endif
@@ -68,128 +58,124 @@
 # define MEMCPY memcpy
 #endif
 
-ENTRY_ALIGN (MEMMOVE, 6)
+/* This implementation supports both memcpy and memmove and shares most code.
+   It uses unaligned accesses and branchless sequences to keep the code small,
+   simple and improve performance.
 
-	DELOUSE (0)
-	DELOUSE (1)
-	DELOUSE (2)
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check in memmove is negligible since it is only required for large copies.
 
-	sub	tmp1, dstin, src
-	cmp	count, 128
-	ccmp	tmp1, count, 2, hi
-	b.lo	L(move_long)
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
 
-	/* Common case falls through into memcpy.  */
-END (MEMMOVE)
-libc_hidden_builtin_def (MEMMOVE)
 ENTRY (MEMCPY)
-
 	DELOUSE (0)
 	DELOUSE (1)
 	DELOUSE (2)
 
-	prfm	PLDL1KEEP, [src]
 	add	srcend, src, count
 	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(copy_long)
 	cmp	count, 32
-	b.ls	L(copy32)
-	cmp	count, 128
-	b.hi	L(copy_long)
+	b.hi	L(copy32_128)
 
-	/* Medium copies: 33..128 bytes.  */
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
 	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
 	ldp	D_l, D_h, [srcend, -16]
-	cmp	count, 64
-	b.hi	L(copy128)
 	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
 	stp	D_l, D_h, [dstend, -16]
 	ret
 
-	.p2align 4
-	/* Small copies: 0..32 bytes.  */
-L(copy32):
-	/* 16-32 bytes.  */
-	cmp	count, 16
-	b.lo	1f
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [srcend, -16]
-	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstend, -16]
-	ret
-	.p2align 4
-1:
-	/* 8-15 bytes.  */
-	tbz	count, 3, 1f
+	/* Copy 8-15 bytes.  */
+L(copy16):
+	tbz	count, 3, L(copy8)
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
 	str	A_h, [dstend, -8]
 	ret
-	.p2align 4
-1:
-	/* 4-7 bytes.  */
-	tbz	count, 2, 1f
+
+	.p2align 3
+	/* Copy 4-7 bytes.  */
+L(copy8):
+	tbz	count, 2, L(copy4)
 	ldr	A_lw, [src]
-	ldr	A_hw, [srcend, -4]
+	ldr	B_lw, [srcend, -4]
 	str	A_lw, [dstin]
-	str	A_hw, [dstend, -4]
+	str	B_lw, [dstend, -4]
 	ret
 
-	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
-	cbz	count, 2f
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
 	lsr	tmp1, count, 1
 	ldrb	A_lw, [src]
-	ldrb	A_hw, [srcend, -1]
+	ldrb	C_lw, [srcend, -1]
 	ldrb	B_lw, [src, tmp1]
 	strb	A_lw, [dstin]
 	strb	B_lw, [dstin, tmp1]
-	strb	A_hw, [dstend, -1]
-2:	ret
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
 
 	.p2align 4
-	/* Copy 65..128 bytes.  Copy 64 bytes from the start and
-	   64 bytes from the end.  */
+	/* Copy 65..128 bytes.  */
L(copy128):
 	ldp	E_l, E_h, [src, 32]
 	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	L(copy96)
 	ldp	G_l, G_h, [srcend, -64]
 	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+L(copy96):
 	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	E_l, E_h, [dstin, 32]
-	stp	F_l, F_h, [dstin, 48]
-	stp	G_l, G_h, [dstend, -64]
-	stp	H_l, H_h, [dstend, -48]
-	stp	C_l, C_h, [dstend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
 	stp	D_l, D_h, [dstend, -16]
 	ret
 
-	/* Align DST to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 128 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
 	.p2align 4
+	/* Copy more than 128 bytes.  */
 L(copy_long):
+	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
+	ldp	D_l, D_h, [src]
 	and	tmp1, dstin, 15
 	bic	dst, dstin, 15
-	ldp	D_l, D_h, [src]
 	sub	src, src, tmp1
-	add	count, count, tmp1  /* Count is now 16 too large.  */
+	add	count, count, tmp1	/* Count is now 16 too large.  */
 	ldp	A_l, A_h, [src, 16]
 	stp	D_l, D_h, [dstin]
 	ldp	B_l, B_h, [src, 32]
 	ldp	C_l, C_h, [src, 48]
 	ldp	D_l, D_h, [src, 64]!
-	subs	count, count, 128 + 16  /* Test and readjust count.  */
-	b.ls	L(last64)
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+
 L(loop64):
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [src, 16]
@@ -202,10 +188,8 @@ L(loop64):
 	subs	count, count, 64
 	b.hi	L(loop64)
 
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the end even if
-	   there is just 1 byte left.  */
-L(last64):
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
 	ldp	E_l, E_h, [srcend, -64]
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [srcend, -48]
@@ -220,20 +204,42 @@ L(last64):
 	stp	C_l, C_h, [dstend, -16]
 	ret
 
-	.p2align 4
-L(move_long):
-	cbz	tmp1, 3f
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+ENTRY_ALIGN (MEMMOVE, 4)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
 
 	add	srcend, src, count
 	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(move_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldp	A_l, A_h, [src]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
 
-	/* Align dstend to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 128 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+	.p2align 4
+L(move_long):
+	/* Only use backward copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cbz	tmp1, L(copy0)
+	cmp	tmp1, count
+	b.hs	L(copy_long)
 
-	and	tmp1, dstend, 15
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment.  */
 	ldp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
 	sub	srcend, srcend, tmp1
 	sub	count, count, tmp1
 	ldp	A_l, A_h, [srcend, -16]
@@ -243,10 +249,9 @@ L(move_long):
 	ldp	D_l, D_h, [srcend, -64]!
 	sub	dstend, dstend, tmp1
 	subs	count, count, 128
-	b.ls	2f
+	b.ls	L(copy64_from_start)
 
-	nop
-1:
+L(loop64_backwards):
 	stp	A_l, A_h, [dstend, -16]
 	ldp	A_l, A_h, [srcend, -16]
 	stp	B_l, B_h, [dstend, -32]
@@ -256,12 +261,10 @@ L(move_long):
 	stp	D_l, D_h, [dstend, -64]!
 	ldp	D_l, D_h, [srcend, -64]!
 	subs	count, count, 64
-	b.hi	1b
+	b.hi	L(loop64_backwards)
 
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the start even if
-	   there is just 1 byte left.  */
-2:
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
 	ldp	G_l, G_h, [src, 48]
 	stp	A_l, A_h, [dstend, -16]
 	ldp	A_l, A_h, [src, 32]
@@ -274,7 +277,7 @@ L(move_long):
 	stp	A_l, A_h, [dstin, 32]
 	stp	B_l, B_h, [dstin, 16]
 	stp	C_l, C_h, [dstin]
-3:	ret
+	ret
 
-END (MEMCPY)
-libc_hidden_builtin_def (MEMCPY)
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
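
As a rough illustration of the approach for readers less fluent in AArch64
assembly (this is not part of the patch, and the C function names below are
invented for the example): the small and medium paths copy a fixed amount from
the start and a fixed amount from the end of the buffer and let the two ranges
overlap in the middle, so one straight-line sequence handles a whole range of
sizes. In C, the 17..32 byte case and the branchless 0..3 byte case look
roughly like this:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sketch of the 17..32 byte case: one 16-byte copy from the start and
   one from the end.  The two ranges overlap in the middle, so every byte
   is written and no length-dependent loop is needed.  All loads happen
   before any store, which is what lets memmove share this path.  */
static void
copy17_32_sketch (char *dst, const char *src, size_t count)
{
  uint64_t a[2], d[2];			/* A_l/A_h and D_l/D_h.  */
  memcpy (a, src, 16);			/* ldp A_l, A_h, [src]  */
  memcpy (d, src + count - 16, 16);	/* ldp D_l, D_h, [srcend, -16]  */
  memcpy (dst, a, 16);			/* stp A_l, A_h, [dstin]  */
  memcpy (dst + count - 16, d, 16);	/* stp D_l, D_h, [dstend, -16]  */
}

/* Sketch of L(copy4): copy 0..3 bytes without branching on the exact
   size.  For count==1 the same byte is written three times; for count==2
   the middle byte (count/2 == 1) duplicates the last one.  */
static void
copy0_3_sketch (char *dst, const char *src, size_t count)
{
  if (count == 0)			/* cbz count, L(copy0)  */
    return;
  size_t mid = count >> 1;		/* lsr tmp1, count, 1  */
  unsigned char a = src[0];		/* ldrb A_lw, [src]  */
  unsigned char c = src[count - 1];	/* ldrb C_lw, [srcend, -1]  */
  unsigned char b = src[mid];		/* ldrb B_lw, [src, tmp1]  */
  dst[0] = a;				/* strb A_lw, [dstin]  */
  dst[mid] = b;				/* strb B_lw, [dstin, tmp1]  */
  dst[count - 1] = c;			/* strb C_lw, [dstend, -1]  */
}

int
main (void)
{
  const char s[] = "abcdefghijklmnopqrstuvwxyz012345";
  char buf[32];
  for (size_t n = 17; n <= 32; n++)
    {
      copy17_32_sketch (buf, s, n);
      assert (memcmp (buf, s, n) == 0);
    }
  for (size_t n = 0; n <= 3; n++)
    {
      copy0_3_sketch (buf, s, n);
      assert (memcmp (buf, s, n) == 0);
    }
  puts ("sketches match memcpy");
  return 0;
}

The same trick scales up: the 33..128 byte path loads up to 64 bytes from each
end before storing anything, and the large-copy loop finishes by always copying
the last 64 bytes from srcend - 64, so the loop tail needs no remainder loop.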