From: Wilco Dijkstra
To: glibc-cvs@sourceware.org
Subject: [glibc/release/2.28/master] AArch64: Add optimized Q-register memcpy
X-Act-Checkin: glibc
X-Git-Author: Wilco Dijkstra
X-Git-Refname: refs/heads/release/2.28/master
X-Git-Oldrev: 118fbee7a0dfac0d311b8a7a8f8bd8d1fb6e205b
X-Git-Newrev: 4bd28df0b0598c380f4ae63b96eaadc782c9d709
Message-Id: <20201014165151.81EAE386183F@sourceware.org>
Date: Wed, 14 Oct 2020 16:51:51 +0000 (GMT)

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=4bd28df0b0598c380f4ae63b96eaadc782c9d709

commit 4bd28df0b0598c380f4ae63b96eaadc782c9d709
Author: Wilco Dijkstra
Date:   Wed Jul 15 16:55:07 2020 +0100

    AArch64: Add optimized Q-register memcpy

    Add a new memcpy using 128-bit Q registers - this is faster on modern
    cores and reduces codesize.  Similar to the generic memcpy, small cases
    include copies up to 32 bytes.  64-128 byte copies are split into two
    cases to improve performance of 64-96 byte copies.  Large copies align
    the source rather than the destination.

    bench-memcpy-random is ~9% faster than memcpy_falkor on Neoverse N1,
    so make this memcpy the default on N1 (on Centriq it is 15% faster than
    memcpy_falkor).

    Passes GLIBC regression tests.

    Reviewed-by: Szabolcs Nagy
    (cherry picked from commit 4a733bf375238a6a595033b5785cea7f27d61307)

Diff:
---
 sysdeps/aarch64/multiarch/Makefile          |   2 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   2 +
 sysdeps/aarch64/multiarch/memcpy.c          |   5 +-
 sysdeps/aarch64/multiarch/memcpy_advsimd.S  | 247 ++++++++++++++++++++++++++++
 sysdeps/aarch64/multiarch/memmove.c         |   3 +-
 5 files changed, 255 insertions(+), 4 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 57ffdf7238..134fe463fd 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
                   memcpy_falkor memmove_falkor memset_generic memset_falkor
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index e55be80103..0ccd14118c 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -42,10 +42,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+             IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
              /* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 8f5d4e7df5..a6774f0d17 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,6 +29,7 @@
 extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
@@ -36,11 +37,11 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
 libc_ifunc (__libc_memcpy,
             (IS_THUNDERX (midr)
              ? __memcpy_thunderx
-             : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_ARES (midr)
+             : (IS_FALKOR (midr) || IS_PHECDA (midr)
                ? __memcpy_falkor
                : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
                  ? __memcpy_thunderx2
-                 : __memcpy_generic))));
+                 : (IS_ARES (midr) ? __memcpy_simd : __memcpy_generic)))));
 
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
new file mode 100644
index 0000000000..d4ba747777
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
@@ -0,0 +1,247 @@
+/* Generic optimized memcpy using SIMD.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#define dstin   x0
+#define src     x1
+#define count   x2
+#define dst     x3
+#define srcend  x4
+#define dstend  x5
+#define A_l     x6
+#define A_lw    w6
+#define A_h     x7
+#define B_l     x8
+#define B_lw    w8
+#define B_h     x9
+#define C_lw    w10
+#define tmp1    x14
+
+#define A_q     q0
+#define B_q     q1
+#define C_q     q2
+#define D_q     q3
+#define E_q     q4
+#define F_q     q5
+#define G_q     q6
+#define H_q     q7
+
+
+/* This implementation supports both memcpy and memmove and shares most code.
+   It uses unaligned accesses and branchless sequences to keep the code small,
+   simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check in memmove is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per
+   iteration.  The source pointer is 16-byte aligned to minimize unaligned
+   accesses.  The loop tail is handled by always copying 64 bytes from
+   the end.  */
+
+ENTRY (__memcpy_simd)
+        DELOUSE (0)
+        DELOUSE (1)
+        DELOUSE (2)
+
+        add     srcend, src, count
+        add     dstend, dstin, count
+        cmp     count, 128
+        b.hi    L(copy_long)
+        cmp     count, 32
+        b.hi    L(copy32_128)
+
+        /* Small copies: 0..32 bytes.  */
+        cmp     count, 16
+        b.lo    L(copy16)
+        ldr     A_q, [src]
+        ldr     B_q, [srcend, -16]
+        str     A_q, [dstin]
+        str     B_q, [dstend, -16]
+        ret
+
+        /* Copy 8-15 bytes.  */
+L(copy16):
+        tbz     count, 3, L(copy8)
+        ldr     A_l, [src]
+        ldr     A_h, [srcend, -8]
+        str     A_l, [dstin]
+        str     A_h, [dstend, -8]
+        ret
+
+        /* Copy 4-7 bytes.  */
+L(copy8):
+        tbz     count, 2, L(copy4)
+        ldr     A_lw, [src]
+        ldr     B_lw, [srcend, -4]
+        str     A_lw, [dstin]
+        str     B_lw, [dstend, -4]
+        ret
+
+        /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+        cbz     count, L(copy0)
+        lsr     tmp1, count, 1
+        ldrb    A_lw, [src]
+        ldrb    C_lw, [srcend, -1]
+        ldrb    B_lw, [src, tmp1]
+        strb    A_lw, [dstin]
+        strb    B_lw, [dstin, tmp1]
+        strb    C_lw, [dstend, -1]
+L(copy0):
+        ret
+
+        .p2align 4
+        /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+        ldp     A_q, B_q, [src]
+        ldp     C_q, D_q, [srcend, -32]
+        cmp     count, 64
+        b.hi    L(copy128)
+        stp     A_q, B_q, [dstin]
+        stp     C_q, D_q, [dstend, -32]
+        ret
+
+        .p2align 4
+        /* Copy 65..128 bytes.  */
+L(copy128):
+        ldp     E_q, F_q, [src, 32]
+        cmp     count, 96
+        b.ls    L(copy96)
+        ldp     G_q, H_q, [srcend, -64]
+        stp     G_q, H_q, [dstend, -64]
+L(copy96):
+        stp     A_q, B_q, [dstin]
+        stp     E_q, F_q, [dstin, 32]
+        stp     C_q, D_q, [dstend, -32]
+        ret
+
+        /* Align loop64 below to 16 bytes.  */
+        nop
+
+        /* Copy more than 128 bytes.  */
+L(copy_long):
+        /* Copy 16 bytes and then align src to 16-byte alignment.  */
+        ldr     D_q, [src]
+        and     tmp1, src, 15
+        bic     src, src, 15
+        sub     dst, dstin, tmp1
+        add     count, count, tmp1      /* Count is now 16 too large.  */
+        ldp     A_q, B_q, [src, 16]
+        str     D_q, [dstin]
+        ldp     C_q, D_q, [src, 48]
+        subs    count, count, 128 + 16  /* Test and readjust count.  */
+        b.ls    L(copy64_from_end)
+L(loop64):
+        stp     A_q, B_q, [dst, 16]
+        ldp     A_q, B_q, [src, 80]
+        stp     C_q, D_q, [dst, 48]
+        ldp     C_q, D_q, [src, 112]
+        add     src, src, 64
+        add     dst, dst, 64
+        subs    count, count, 64
+        b.hi    L(loop64)
+
+        /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+        ldp     E_q, F_q, [srcend, -64]
+        stp     A_q, B_q, [dst, 16]
+        ldp     A_q, B_q, [srcend, -32]
+        stp     C_q, D_q, [dst, 48]
+        stp     E_q, F_q, [dstend, -64]
+        stp     A_q, B_q, [dstend, -32]
+        ret
+
+END (__memcpy_simd)
+libc_hidden_builtin_def (__memcpy_simd)
+
+
+ENTRY (__memmove_simd)
+        DELOUSE (0)
+        DELOUSE (1)
+        DELOUSE (2)
+
+        add     srcend, src, count
+        add     dstend, dstin, count
+        cmp     count, 128
+        b.hi    L(move_long)
+        cmp     count, 32
+        b.hi    L(copy32_128)
+
+        /* Small moves: 0..32 bytes.  */
+        cmp     count, 16
+        b.lo    L(copy16)
+        ldr     A_q, [src]
+        ldr     B_q, [srcend, -16]
+        str     A_q, [dstin]
+        str     B_q, [dstend, -16]
+        ret
+
+L(move_long):
+        /* Only use backward copy if there is an overlap.  */
+        sub     tmp1, dstin, src
+        cbz     tmp1, L(move0)
+        cmp     tmp1, count
+        b.hs    L(copy_long)
+
+        /* Large backwards copy for overlapping copies.
+           Copy 16 bytes and then align srcend to 16-byte alignment.  */
+L(copy_long_backwards):
+        ldr     D_q, [srcend, -16]
+        and     tmp1, srcend, 15
+        bic     srcend, srcend, 15
+        sub     count, count, tmp1
+        ldp     A_q, B_q, [srcend, -32]
+        str     D_q, [dstend, -16]
+        ldp     C_q, D_q, [srcend, -64]
+        sub     dstend, dstend, tmp1
+        subs    count, count, 128
+        b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+        stp     A_q, B_q, [dstend, -32]
+        ldp     A_q, B_q, [srcend, -96]
+        stp     C_q, D_q, [dstend, -64]
+        ldp     C_q, D_q, [srcend, -128]
+        sub     srcend, srcend, 64
+        sub     dstend, dstend, 64
+        subs    count, count, 64
+        b.hi    L(loop64_backwards)
+
+        /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+        ldp     E_q, F_q, [src, 32]
+        stp     A_q, B_q, [dstend, -32]
+        ldp     A_q, B_q, [src]
+        stp     C_q, D_q, [dstend, -64]
+        stp     E_q, F_q, [dstin, 32]
+        stp     A_q, B_q, [dstin]
+L(move0):
+        ret
+
+END (__memmove_simd)
+libc_hidden_builtin_def (__memmove_simd)
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index e69d816291..6f585b3758 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,6 +29,7 @@
 extern __typeof (__redirect_memmove) __libc_memmove;
 
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
 
@@ -37,7 +38,7 @@ libc_ifunc (__libc_memmove,
              ? __memmove_thunderx
              : (IS_FALKOR (midr) || IS_PHECDA (midr)
                ? __memmove_falkor
-               : __memmove_generic)));
+               : (IS_ARES (midr) ? __memmove_simd : __memmove_generic))));
 
 # undef memmove
 strong_alias (__libc_memmove, memmove);
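
For readers working through the assembly, the C sketch below summarises the copy-size
strategy the commit message describes: small copies up to 32 bytes, medium copies up
to 128 bytes split at 96, and large copies that align the source and finish by always
copying the last 64 bytes.  It is illustrative only and not part of the patch; the
names memcpy_simd_sketch and copy16 are hypothetical stand-ins for the Q-register
loads and stores, and the authoritative implementation is memcpy_advsimd.S above.

/* Illustrative sketch - NOT part of the patch.  copy16 models one
   Q-register (16-byte) load/store pair.  */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void copy16 (unsigned char *dst, const unsigned char *src)
{
  memcpy (dst, src, 16);
}

static void *memcpy_simd_sketch (void *dstin, const void *srcin, size_t count)
{
  unsigned char *dst = dstin;
  const unsigned char *src = srcin;

  if (count <= 32)
    {
      /* Small copies: one chunk from each end; the chunks may overlap.  */
      if (count >= 16)
        {
          copy16 (dst, src);
          copy16 (dst + count - 16, src + count - 16);
        }
      else if (count > 0)
        memcpy (dst, src, count);   /* 8/4/1-byte cases in the assembly.  */
    }
  else if (count <= 128)
    {
      /* Medium copies: 32 bytes from the start and 32 from the end;
         65..128 byte copies also cover the middle, split at 96 bytes.  */
      copy16 (dst, src);
      copy16 (dst + 16, src + 16);
      if (count > 64)
        {
          copy16 (dst + 32, src + 32);
          copy16 (dst + 48, src + 48);
          if (count > 96)
            {
              copy16 (dst + count - 64, src + count - 64);
              copy16 (dst + count - 48, src + count - 48);
            }
        }
      copy16 (dst + count - 32, src + count - 32);
      copy16 (dst + count - 16, src + count - 16);
    }
  else
    {
      /* Large copies: copy the first 16 bytes, advance both pointers so the
         source becomes 16-byte aligned (the destination may stay unaligned),
         stream 64 bytes per iteration, then always copy the final 64 bytes
         from the end to handle the tail - the L(copy_long) structure.  */
      size_t skew = 16 - ((uintptr_t) src & 15);
      copy16 (dst, src);
      src += skew;
      dst += skew;
      count -= skew;
      while (count > 64)
        {
          copy16 (dst, src);
          copy16 (dst + 16, src + 16);
          copy16 (dst + 32, src + 32);
          copy16 (dst + 48, src + 48);
          src += 64;
          dst += 64;
          count -= 64;
        }
      const unsigned char *srcend = src + count;
      unsigned char *dstend = dst + count;
      copy16 (dstend - 64, srcend - 64);
      copy16 (dstend - 48, srcend - 48);
      copy16 (dstend - 32, srcend - 32);
      copy16 (dstend - 16, srcend - 16);
    }
  return dstin;
}

The __memmove_simd entry point shares the small and medium paths and differs only for
large copies: when the destination starts inside the source region (0 < dstin - src <
count) it runs the same 64-byte loop backwards from the end, otherwise it falls
through to the forward L(copy_long) path.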