Hi,

Further optimize integer memcpy. Small cases now include copies up to 32
bytes. 64-128 byte copies are split into two cases to improve performance
of 64-96 byte copies. Comments have been rewritten.

The attached graph shows how the new memcpy (memcpy_new) performs against
the current generic memcpy and the previous version (memcpy.S before
commit b9f145df85). Passes GLIBC tests.

---

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index ff720c800ed0ca3afac03d19ba02f67817b3422e..e0547259a8618292fe798e70fe5b44409acecc51 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -33,11 +33,11 @@
 #define A_l	x6
 #define A_lw	w6
 #define A_h	x7
-#define A_hw	w7
 #define B_l	x8
 #define B_lw	w8
 #define B_h	x9
 #define C_l	x10
+#define C_lw	w10
 #define C_h	x11
 #define D_l	x12
 #define D_h	x13
@@ -51,16 +51,6 @@
 #define H_h	srcend
 #define tmp1	x14
 
-/* Copies are split into 3 main cases: small copies of up to 32 bytes,
-   medium copies of 33..128 bytes which are fully unrolled. Large copies
-   of more than 128 bytes align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-   In order to share code with memmove, small and medium copies read all
-   data before writing, allowing any kind of overlap. So small, medium
-   and large backwards memmoves are handled by falling through into memcpy.
-   Overlapping large forward memmoves use a loop that copies backwards.
-*/
-
 #ifndef MEMMOVE
 # define MEMMOVE memmove
 #endif
@@ -68,128 +58,124 @@
 # define MEMCPY memcpy
 #endif
 
-ENTRY_ALIGN (MEMMOVE, 6)
+/* This implementation supports both memcpy and memmove and shares most code.
+   It uses unaligned accesses and branchless sequences to keep the code small,
+   simple and improve performance.
 
-	DELOUSE (0)
-	DELOUSE (1)
-	DELOUSE (2)
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check in memmove is negligible since it is only required for large copies.
 
-	sub	tmp1, dstin, src
-	cmp	count, 128
-	ccmp	tmp1, count, 2, hi
-	b.lo	L(move_long)
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
 
-	/* Common case falls through into memcpy.  */
-END (MEMMOVE)
-libc_hidden_builtin_def (MEMMOVE)
 ENTRY (MEMCPY)
-
 	DELOUSE (0)
 	DELOUSE (1)
 	DELOUSE (2)
 
-	prfm	PLDL1KEEP, [src]
 	add	srcend, src, count
 	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(copy_long)
 	cmp	count, 32
-	b.ls	L(copy32)
-	cmp	count, 128
-	b.hi	L(copy_long)
+	b.hi	L(copy32_128)
 
-	/* Medium copies: 33..128 bytes.  */
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
 	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
 	ldp	D_l, D_h, [srcend, -16]
-	cmp	count, 64
-	b.hi	L(copy128)
 	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
 	stp	D_l, D_h, [dstend, -16]
 	ret
 
-	.p2align 4
-	/* Small copies: 0..32 bytes.  */
-L(copy32):
-	/* 16-32 bytes.  */
-	cmp	count, 16
-	b.lo	1f
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [srcend, -16]
-	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstend, -16]
-	ret
-	.p2align 4
-1:
-	/* 8-15 bytes.  */
-	tbz	count, 3, 1f
+	/* Copy 8-15 bytes.  */
+L(copy16):
+	tbz	count, 3, L(copy8)
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
 	str	A_h, [dstend, -8]
 	ret
-	.p2align 4
-1:
-	/* 4-7 bytes.  */
-	tbz	count, 2, 1f
+
+	.p2align 3
+	/* Copy 4-7 bytes.  */
+L(copy8):
+	tbz	count, 2, L(copy4)
 	ldr	A_lw, [src]
-	ldr	A_hw, [srcend, -4]
+	ldr	B_lw, [srcend, -4]
 	str	A_lw, [dstin]
-	str	A_hw, [dstend, -4]
+	str	B_lw, [dstend, -4]
 	ret
 
-	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
-	cbz	count, 2f
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
 	lsr	tmp1, count, 1
 	ldrb	A_lw, [src]
-	ldrb	A_hw, [srcend, -1]
+	ldrb	C_lw, [srcend, -1]
 	ldrb	B_lw, [src, tmp1]
 	strb	A_lw, [dstin]
 	strb	B_lw, [dstin, tmp1]
-	strb	A_hw, [dstend, -1]
-2:	ret
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
 
 	.p2align 4
-	/* Copy 65..128 bytes.  Copy 64 bytes from the start and
-	   64 bytes from the end.  */
+	/* Copy 65..128 bytes.  */
L(copy128):
 	ldp	E_l, E_h, [src, 32]
 	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	L(copy96)
 	ldp	G_l, G_h, [srcend, -64]
 	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+L(copy96):
 	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	E_l, E_h, [dstin, 32]
-	stp	F_l, F_h, [dstin, 48]
-	stp	G_l, G_h, [dstend, -64]
-	stp	H_l, H_h, [dstend, -48]
-	stp	C_l, C_h, [dstend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
 	stp	D_l, D_h, [dstend, -16]
 	ret
 
-	/* Align DST to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 128 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
 	.p2align 4
+	/* Copy more than 128 bytes.  */
 L(copy_long):
+	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
+	ldp	D_l, D_h, [src]
 	and	tmp1, dstin, 15
 	bic	dst, dstin, 15
-	ldp	D_l, D_h, [src]
 	sub	src, src, tmp1
-	add	count, count, tmp1  /* Count is now 16 too large.  */
+	add	count, count, tmp1	/* Count is now 16 too large.  */
 	ldp	A_l, A_h, [src, 16]
 	stp	D_l, D_h, [dstin]
 	ldp	B_l, B_h, [src, 32]
 	ldp	C_l, C_h, [src, 48]
 	ldp	D_l, D_h, [src, 64]!
-	subs	count, count, 128 + 16  /* Test and readjust count.  */
-	b.ls	L(last64)
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+
 L(loop64):
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [src, 16]
@@ -202,10 +188,8 @@ L(loop64):
 	subs	count, count, 64
 	b.hi	L(loop64)
 
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the end even if
-	   there is just 1 byte left.  */
-L(last64):
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
 	ldp	E_l, E_h, [srcend, -64]
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [srcend, -48]
@@ -220,20 +204,42 @@ L(last64):
 	stp	C_l, C_h, [dstend, -16]
 	ret
 
-	.p2align 4
-L(move_long):
-	cbz	tmp1, 3f
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+ENTRY_ALIGN (MEMMOVE, 4)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
 
 	add	srcend, src, count
 	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(move_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldp	A_l, A_h, [src]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
 
-	/* Align dstend to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 128 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+	.p2align 4
+L(move_long):
+	/* Only use backward copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cbz	tmp1, L(copy0)
+	cmp	tmp1, count
+	b.hs	L(copy_long)
 
-	and	tmp1, dstend, 15
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment.  */
 	ldp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
 	sub	srcend, srcend, tmp1
 	sub	count, count, tmp1
 	ldp	A_l, A_h, [srcend, -16]
@@ -243,10 +249,9 @@ L(move_long):
 	ldp	D_l, D_h, [srcend, -64]!
 	sub	dstend, dstend, tmp1
 	subs	count, count, 128
-	b.ls	2f
+	b.ls	L(copy64_from_start)
 
-	nop
-1:
+L(loop64_backwards):
 	stp	A_l, A_h, [dstend, -16]
 	ldp	A_l, A_h, [srcend, -16]
 	stp	B_l, B_h, [dstend, -32]
@@ -256,12 +261,10 @@ L(move_long):
 	stp	D_l, D_h, [dstend, -64]!
 	ldp	D_l, D_h, [srcend, -64]!
 	subs	count, count, 64
-	b.hi	1b
+	b.hi	L(loop64_backwards)
 
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the start even if
-	   there is just 1 byte left.  */
-2:
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
 	ldp	G_l, G_h, [src, 48]
 	stp	A_l, A_h, [dstend, -16]
 	ldp	A_l, A_h, [src, 32]
@@ -274,7 +277,7 @@ L(move_long):
 	stp	A_l, A_h, [dstin, 32]
 	stp	B_l, B_h, [dstin, 16]
 	stp	C_l, C_h, [dstin]
-3:	ret
+	ret
 
-END (MEMCPY)
-libc_hidden_builtin_def (MEMCPY)
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
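
As a rough illustration of the approach for readers less fluent in AArch64
assembly (this is not part of the patch, and the C function names below are
invented for the example): the small and medium paths copy a fixed amount from
the start and a fixed amount from the end of the buffer and let the two ranges
overlap in the middle, so one straight-line sequence handles a whole range of
sizes. In C, the 17..32 byte case and the branchless 0..3 byte case look
roughly like this:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sketch of the 17..32 byte case: one 16-byte copy from the start and
   one from the end.  The two ranges overlap in the middle, so every byte
   is written and no length-dependent loop is needed.  All loads happen
   before any store, which is what lets memmove share this path.  */
static void
copy17_32_sketch (char *dst, const char *src, size_t count)
{
  uint64_t a[2], d[2];			/* A_l/A_h and D_l/D_h.  */
  memcpy (a, src, 16);			/* ldp A_l, A_h, [src]  */
  memcpy (d, src + count - 16, 16);	/* ldp D_l, D_h, [srcend, -16]  */
  memcpy (dst, a, 16);			/* stp A_l, A_h, [dstin]  */
  memcpy (dst + count - 16, d, 16);	/* stp D_l, D_h, [dstend, -16]  */
}

/* Sketch of L(copy4): copy 0..3 bytes without branching on the exact
   size.  For count==1 the same byte is written three times; for count==2
   the middle byte (count/2 == 1) duplicates the last one.  */
static void
copy0_3_sketch (char *dst, const char *src, size_t count)
{
  if (count == 0)			/* cbz count, L(copy0)  */
    return;
  size_t mid = count >> 1;		/* lsr tmp1, count, 1  */
  unsigned char a = src[0];		/* ldrb A_lw, [src]  */
  unsigned char c = src[count - 1];	/* ldrb C_lw, [srcend, -1]  */
  unsigned char b = src[mid];		/* ldrb B_lw, [src, tmp1]  */
  dst[0] = a;				/* strb A_lw, [dstin]  */
  dst[mid] = b;				/* strb B_lw, [dstin, tmp1]  */
  dst[count - 1] = c;			/* strb C_lw, [dstend, -1]  */
}

int
main (void)
{
  const char s[] = "abcdefghijklmnopqrstuvwxyz012345";
  char buf[32];
  for (size_t n = 17; n <= 32; n++)
    {
      copy17_32_sketch (buf, s, n);
      assert (memcmp (buf, s, n) == 0);
    }
  for (size_t n = 0; n <= 3; n++)
    {
      copy0_3_sketch (buf, s, n);
      assert (memcmp (buf, s, n) == 0);
    }
  puts ("sketches match memcpy");
  return 0;
}

The same trick scales up: the 33..128 byte path loads up to 64 bytes from each
end before storing anything, and the large-copy loop finishes by always copying
the last 64 bytes from srcend - 64, so the loop tail needs no remainder loop.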