From: Andrew Pinski <pinskia@gmail.com>
To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Cc: GNU C Library <libc-alpha@sourceware.org>
Subject: Re: [PATCH] aarch64: Use memcpy_simd as the default memcpy
Date: Wed, 12 Oct 2022 12:12:52 -0700 [thread overview]
Message-ID: <CA+=Sn1mf4o2XNOdyUtsMV_P98P0PXampaNXw8ikjf_LYASwH7A@mail.gmail.com> (raw)
In-Reply-To: <AS4PR08MB79015B70DC5F74EB452222B183229@AS4PR08MB7901.eurprd08.prod.outlook.com>
On Wed, Oct 12, 2022 at 8:20 AM Wilco Dijkstra via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default
> if SVE is not available.
>
> Passes regress, OK for commit?
>
> ---
> diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
> index 98d4e2c0e202eca13e1fd19ad8046cf61ad280ff..7b396b202fabf01b6ff2adc71a1038148e0b1054 100644
> --- a/sysdeps/aarch64/memcpy.S
> +++ b/sysdeps/aarch64/memcpy.S
> @@ -1,4 +1,5 @@
> -/* Copyright (C) 2012-2022 Free Software Foundation, Inc.
> +/* Generic optimized memcpy using SIMD.
> + Copyright (C) 2012-2022 Free Software Foundation, Inc.
>
> This file is part of the GNU C Library.
>
> @@ -20,7 +21,7 @@
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64, unaligned accesses.
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> *
> */
>
> @@ -36,21 +37,18 @@
> #define B_l x8
> #define B_lw w8
> #define B_h x9
> -#define C_l x10
> #define C_lw w10
> -#define C_h x11
> -#define D_l x12
> -#define D_h x13
> -#define E_l x14
> -#define E_h x15
> -#define F_l x16
> -#define F_h x17
> -#define G_l count
> -#define G_h dst
> -#define H_l src
> -#define H_h srcend
> #define tmp1 x14
>
> +#define A_q q0
> +#define B_q q1
> +#define C_q q2
> +#define D_q q3
> +#define E_q q4
> +#define F_q q5
> +#define G_q q6
> +#define H_q q7
> +
> #ifndef MEMMOVE
> # define MEMMOVE memmove
> #endif
> @@ -69,10 +67,9 @@
> Large copies use a software pipelined loop processing 64 bytes per
> iteration. The destination pointer is 16-byte aligned to minimize
> unaligned accesses. The loop tail is handled by always copying 64 bytes
> - from the end.
> -*/
> + from the end. */
>
> -ENTRY_ALIGN (MEMCPY, 6)
> +ENTRY (MEMCPY)
> PTR_ARG (0)
> PTR_ARG (1)
> SIZE_ARG (2)
> @@ -87,10 +84,10 @@ ENTRY_ALIGN (MEMCPY, 6)
> /* Small copies: 0..32 bytes. */
> cmp count, 16
> b.lo L(copy16)
> - ldp A_l, A_h, [src]
> - ldp D_l, D_h, [srcend, -16]
> - stp A_l, A_h, [dstin]
> - stp D_l, D_h, [dstend, -16]
> + ldr A_q, [src]
> + ldr B_q, [srcend, -16]
> + str A_q, [dstin]
> + str B_q, [dstend, -16]
> ret
>
> /* Copy 8-15 bytes. */
> @@ -102,7 +99,6 @@ L(copy16):
> str A_h, [dstend, -8]
> ret
>
> - .p2align 3
> /* Copy 4-7 bytes. */
> L(copy8):
> tbz count, 2, L(copy4)
> @@ -128,87 +124,69 @@ L(copy0):
> .p2align 4
> /* Medium copies: 33..128 bytes. */
> L(copy32_128):
> - ldp A_l, A_h, [src]
> - ldp B_l, B_h, [src, 16]
> - ldp C_l, C_h, [srcend, -32]
> - ldp D_l, D_h, [srcend, -16]
> + ldp A_q, B_q, [src]
> + ldp C_q, D_q, [srcend, -32]
> cmp count, 64
> b.hi L(copy128)
> - stp A_l, A_h, [dstin]
> - stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstend, -32]
> - stp D_l, D_h, [dstend, -16]
> + stp A_q, B_q, [dstin]
> + stp C_q, D_q, [dstend, -32]
> ret
>
> .p2align 4
> /* Copy 65..128 bytes. */
> L(copy128):
> - ldp E_l, E_h, [src, 32]
> - ldp F_l, F_h, [src, 48]
> + ldp E_q, F_q, [src, 32]
> cmp count, 96
> b.ls L(copy96)
> - ldp G_l, G_h, [srcend, -64]
> - ldp H_l, H_h, [srcend, -48]
> - stp G_l, G_h, [dstend, -64]
> - stp H_l, H_h, [dstend, -48]
> + ldp G_q, H_q, [srcend, -64]
> + stp G_q, H_q, [dstend, -64]
> L(copy96):
> - stp A_l, A_h, [dstin]
> - stp B_l, B_h, [dstin, 16]
> - stp E_l, E_h, [dstin, 32]
> - stp F_l, F_h, [dstin, 48]
> - stp C_l, C_h, [dstend, -32]
> - stp D_l, D_h, [dstend, -16]
> + stp A_q, B_q, [dstin]
> + stp E_q, F_q, [dstin, 32]
> + stp C_q, D_q, [dstend, -32]
> ret
>
> - .p2align 4
> + /* Align loop64 below to 16 bytes. */
> + nop
> +
> /* Copy more than 128 bytes. */
> L(copy_long):
> - /* Copy 16 bytes and then align dst to 16-byte alignment. */
> - ldp D_l, D_h, [src]
> - and tmp1, dstin, 15
> - bic dst, dstin, 15
> - sub src, src, tmp1
> + /* Copy 16 bytes and then align src to 16-byte alignment. */
> + ldr D_q, [src]
> + and tmp1, src, 15
> + bic src, src, 15
> + sub dst, dstin, tmp1
> add count, count, tmp1 /* Count is now 16 too large. */
> - ldp A_l, A_h, [src, 16]
> - stp D_l, D_h, [dstin]
> - ldp B_l, B_h, [src, 32]
> - ldp C_l, C_h, [src, 48]
> - ldp D_l, D_h, [src, 64]!
> + ldp A_q, B_q, [src, 16]
> + str D_q, [dstin]
> + ldp C_q, D_q, [src, 48]
> subs count, count, 128 + 16 /* Test and readjust count. */
> b.ls L(copy64_from_end)
> -
> L(loop64):
> - stp A_l, A_h, [dst, 16]
> - ldp A_l, A_h, [src, 16]
> - stp B_l, B_h, [dst, 32]
> - ldp B_l, B_h, [src, 32]
> - stp C_l, C_h, [dst, 48]
> - ldp C_l, C_h, [src, 48]
> - stp D_l, D_h, [dst, 64]!
> - ldp D_l, D_h, [src, 64]!
> + stp A_q, B_q, [dst, 16]
> + ldp A_q, B_q, [src, 80]
> + stp C_q, D_q, [dst, 48]
> + ldp C_q, D_q, [src, 112]
> + add src, src, 64
> + add dst, dst, 64
> subs count, count, 64
> b.hi L(loop64)
>
> /* Write the last iteration and copy 64 bytes from the end. */
> L(copy64_from_end):
> - ldp E_l, E_h, [srcend, -64]
> - stp A_l, A_h, [dst, 16]
> - ldp A_l, A_h, [srcend, -48]
> - stp B_l, B_h, [dst, 32]
> - ldp B_l, B_h, [srcend, -32]
> - stp C_l, C_h, [dst, 48]
> - ldp C_l, C_h, [srcend, -16]
> - stp D_l, D_h, [dst, 64]
> - stp E_l, E_h, [dstend, -64]
> - stp A_l, A_h, [dstend, -48]
> - stp B_l, B_h, [dstend, -32]
> - stp C_l, C_h, [dstend, -16]
> + ldp E_q, F_q, [srcend, -64]
> + stp A_q, B_q, [dst, 16]
> + ldp A_q, B_q, [srcend, -32]
> + stp C_q, D_q, [dst, 48]
> + stp E_q, F_q, [dstend, -64]
> + stp A_q, B_q, [dstend, -32]
> ret
>
> END (MEMCPY)
> libc_hidden_builtin_def (MEMCPY)
>
> -ENTRY_ALIGN (MEMMOVE, 4)
> +
> +ENTRY (MEMMOVE)
> PTR_ARG (0)
> PTR_ARG (1)
> SIZE_ARG (2)
> @@ -220,64 +198,56 @@ ENTRY_ALIGN (MEMMOVE, 4)
> cmp count, 32
> b.hi L(copy32_128)
>
> - /* Small copies: 0..32 bytes. */
> + /* Small moves: 0..32 bytes. */
> cmp count, 16
> b.lo L(copy16)
> - ldp A_l, A_h, [src]
> - ldp D_l, D_h, [srcend, -16]
> - stp A_l, A_h, [dstin]
> - stp D_l, D_h, [dstend, -16]
> + ldr A_q, [src]
> + ldr B_q, [srcend, -16]
> + str A_q, [dstin]
> + str B_q, [dstend, -16]
> ret
>
> - .p2align 4
> L(move_long):
> /* Only use backward copy if there is an overlap. */
> sub tmp1, dstin, src
> - cbz tmp1, L(copy0)
> + cbz tmp1, L(move0)
> cmp tmp1, count
> b.hs L(copy_long)
>
> /* Large backwards copy for overlapping copies.
> - Copy 16 bytes and then align dst to 16-byte alignment. */
> - ldp D_l, D_h, [srcend, -16]
> - and tmp1, dstend, 15
> - sub srcend, srcend, tmp1
> + Copy 16 bytes and then align srcend to 16-byte alignment. */
> +L(copy_long_backwards):
> + ldr D_q, [srcend, -16]
> + and tmp1, srcend, 15
> + bic srcend, srcend, 15
> sub count, count, tmp1
> - ldp A_l, A_h, [srcend, -16]
> - stp D_l, D_h, [dstend, -16]
> - ldp B_l, B_h, [srcend, -32]
> - ldp C_l, C_h, [srcend, -48]
> - ldp D_l, D_h, [srcend, -64]!
> + ldp A_q, B_q, [srcend, -32]
> + str D_q, [dstend, -16]
> + ldp C_q, D_q, [srcend, -64]
> sub dstend, dstend, tmp1
> subs count, count, 128
> b.ls L(copy64_from_start)
>
> L(loop64_backwards):
> - stp A_l, A_h, [dstend, -16]
> - ldp A_l, A_h, [srcend, -16]
> - stp B_l, B_h, [dstend, -32]
> - ldp B_l, B_h, [srcend, -32]
> - stp C_l, C_h, [dstend, -48]
> - ldp C_l, C_h, [srcend, -48]
> - stp D_l, D_h, [dstend, -64]!
> - ldp D_l, D_h, [srcend, -64]!
> + str B_q, [dstend, -16]
> + str A_q, [dstend, -32]
> + ldp A_q, B_q, [srcend, -96]
> + str D_q, [dstend, -48]
> + str C_q, [dstend, -64]!
> + ldp C_q, D_q, [srcend, -128]
> + sub srcend, srcend, 64
> subs count, count, 64
> b.hi L(loop64_backwards)
>
> /* Write the last iteration and copy 64 bytes from the start. */
> L(copy64_from_start):
> - ldp G_l, G_h, [src, 48]
> - stp A_l, A_h, [dstend, -16]
> - ldp A_l, A_h, [src, 32]
> - stp B_l, B_h, [dstend, -32]
> - ldp B_l, B_h, [src, 16]
> - stp C_l, C_h, [dstend, -48]
> - ldp C_l, C_h, [src]
> - stp D_l, D_h, [dstend, -64]
> - stp G_l, G_h, [dstin, 48]
> - stp A_l, A_h, [dstin, 32]
> - stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstin]
> + ldp E_q, F_q, [src, 32]
> + stp A_q, B_q, [dstend, -32]
> + ldp A_q, B_q, [src]
> + stp C_q, D_q, [dstend, -64]
> + stp E_q, F_q, [dstin, 32]
> + stp A_q, B_q, [dstin]
> +L(move0):
> ret
>
> END (MEMMOVE)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index bc5cde8add07b908178fb0271decc27f728f7a2e..7f2d85b0e5acc0a694e91b17fbccc0dba0ea339d 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -3,7 +3,6 @@ sysdep_routines += \
> memchr_generic \
> memchr_nosimd \
> memcpy_a64fx \
> - memcpy_advsimd \
> memcpy_generic \
> memcpy_sve \
> memcpy_thunderx \
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 9c2542de38fb109b7c6f1db4aacee3a6b544fa3f..e7c4dcc0ed5a68ecd8dacc06256d0749b76912cb 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -36,7 +36,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL (i, name, memcpy,
> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
> IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
> - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
> #if HAVE_AARCH64_SVE_ASM
> IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
> IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
> @@ -45,7 +44,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL (i, name, memmove,
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
> IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
> - IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
> #if HAVE_AARCH64_SVE_ASM
> IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
> IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> index 5006b0594a476bcc149f2ae022bea50379d04908..1e08ce852e68409fd0eeb975edab77ebe8da8635 100644
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -29,7 +29,6 @@
> extern __typeof (__redirect_memcpy) __libc_memcpy;
>
> extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
> -extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
> @@ -40,9 +39,6 @@ select_memcpy_ifunc (void)
> {
> INIT_ARCH ();
>
> - if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
> - return __memcpy_simd;
> -
> if (sve && HAVE_AARCH64_SVE_ASM)
> {
This changes how neoverse-n2 is handled — is that expected?
That is, neoverse-n2 was returning __memcpy_simd before and will now
return __memcpy_sve, since N2 has SVE.
Thanks,
Andrew Pinski
> if (IS_A64FX (midr))
> diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
> deleted file mode 100644
> index fe9beaf5ead47268867bee98acad3b17c554656a..0000000000000000000000000000000000000000
> --- a/sysdeps/aarch64/multiarch/memcpy_advsimd.S
> +++ /dev/null
> @@ -1,248 +0,0 @@
> -/* Generic optimized memcpy using SIMD.
> - Copyright (C) 2020-2022 Free Software Foundation, Inc.
> -
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library. If not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -/* Assumptions:
> - *
> - * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> - *
> - */
> -
> -#define dstin x0
> -#define src x1
> -#define count x2
> -#define dst x3
> -#define srcend x4
> -#define dstend x5
> -#define A_l x6
> -#define A_lw w6
> -#define A_h x7
> -#define B_l x8
> -#define B_lw w8
> -#define B_h x9
> -#define C_lw w10
> -#define tmp1 x14
> -
> -#define A_q q0
> -#define B_q q1
> -#define C_q q2
> -#define D_q q3
> -#define E_q q4
> -#define F_q q5
> -#define G_q q6
> -#define H_q q7
> -
> -
> -/* This implementation supports both memcpy and memmove and shares most code.
> - It uses unaligned accesses and branchless sequences to keep the code small,
> - simple and improve performance.
> -
> - Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> - copies of up to 128 bytes, and large copies. The overhead of the overlap
> - check in memmove is negligible since it is only required for large copies.
> -
> - Large copies use a software pipelined loop processing 64 bytes per
> - iteration. The destination pointer is 16-byte aligned to minimize
> - unaligned accesses. The loop tail is handled by always copying 64 bytes
> - from the end. */
> -
> -ENTRY (__memcpy_simd)
> - PTR_ARG (0)
> - PTR_ARG (1)
> - SIZE_ARG (2)
> -
> - add srcend, src, count
> - add dstend, dstin, count
> - cmp count, 128
> - b.hi L(copy_long)
> - cmp count, 32
> - b.hi L(copy32_128)
> -
> - /* Small copies: 0..32 bytes. */
> - cmp count, 16
> - b.lo L(copy16)
> - ldr A_q, [src]
> - ldr B_q, [srcend, -16]
> - str A_q, [dstin]
> - str B_q, [dstend, -16]
> - ret
> -
> - /* Copy 8-15 bytes. */
> -L(copy16):
> - tbz count, 3, L(copy8)
> - ldr A_l, [src]
> - ldr A_h, [srcend, -8]
> - str A_l, [dstin]
> - str A_h, [dstend, -8]
> - ret
> -
> - /* Copy 4-7 bytes. */
> -L(copy8):
> - tbz count, 2, L(copy4)
> - ldr A_lw, [src]
> - ldr B_lw, [srcend, -4]
> - str A_lw, [dstin]
> - str B_lw, [dstend, -4]
> - ret
> -
> - /* Copy 0..3 bytes using a branchless sequence. */
> -L(copy4):
> - cbz count, L(copy0)
> - lsr tmp1, count, 1
> - ldrb A_lw, [src]
> - ldrb C_lw, [srcend, -1]
> - ldrb B_lw, [src, tmp1]
> - strb A_lw, [dstin]
> - strb B_lw, [dstin, tmp1]
> - strb C_lw, [dstend, -1]
> -L(copy0):
> - ret
> -
> - .p2align 4
> - /* Medium copies: 33..128 bytes. */
> -L(copy32_128):
> - ldp A_q, B_q, [src]
> - ldp C_q, D_q, [srcend, -32]
> - cmp count, 64
> - b.hi L(copy128)
> - stp A_q, B_q, [dstin]
> - stp C_q, D_q, [dstend, -32]
> - ret
> -
> - .p2align 4
> - /* Copy 65..128 bytes. */
> -L(copy128):
> - ldp E_q, F_q, [src, 32]
> - cmp count, 96
> - b.ls L(copy96)
> - ldp G_q, H_q, [srcend, -64]
> - stp G_q, H_q, [dstend, -64]
> -L(copy96):
> - stp A_q, B_q, [dstin]
> - stp E_q, F_q, [dstin, 32]
> - stp C_q, D_q, [dstend, -32]
> - ret
> -
> - /* Align loop64 below to 16 bytes. */
> - nop
> -
> - /* Copy more than 128 bytes. */
> -L(copy_long):
> - /* Copy 16 bytes and then align src to 16-byte alignment. */
> - ldr D_q, [src]
> - and tmp1, src, 15
> - bic src, src, 15
> - sub dst, dstin, tmp1
> - add count, count, tmp1 /* Count is now 16 too large. */
> - ldp A_q, B_q, [src, 16]
> - str D_q, [dstin]
> - ldp C_q, D_q, [src, 48]
> - subs count, count, 128 + 16 /* Test and readjust count. */
> - b.ls L(copy64_from_end)
> -L(loop64):
> - stp A_q, B_q, [dst, 16]
> - ldp A_q, B_q, [src, 80]
> - stp C_q, D_q, [dst, 48]
> - ldp C_q, D_q, [src, 112]
> - add src, src, 64
> - add dst, dst, 64
> - subs count, count, 64
> - b.hi L(loop64)
> -
> - /* Write the last iteration and copy 64 bytes from the end. */
> -L(copy64_from_end):
> - ldp E_q, F_q, [srcend, -64]
> - stp A_q, B_q, [dst, 16]
> - ldp A_q, B_q, [srcend, -32]
> - stp C_q, D_q, [dst, 48]
> - stp E_q, F_q, [dstend, -64]
> - stp A_q, B_q, [dstend, -32]
> - ret
> -
> -END (__memcpy_simd)
> -libc_hidden_builtin_def (__memcpy_simd)
> -
> -
> -ENTRY (__memmove_simd)
> - PTR_ARG (0)
> - PTR_ARG (1)
> - SIZE_ARG (2)
> -
> - add srcend, src, count
> - add dstend, dstin, count
> - cmp count, 128
> - b.hi L(move_long)
> - cmp count, 32
> - b.hi L(copy32_128)
> -
> - /* Small moves: 0..32 bytes. */
> - cmp count, 16
> - b.lo L(copy16)
> - ldr A_q, [src]
> - ldr B_q, [srcend, -16]
> - str A_q, [dstin]
> - str B_q, [dstend, -16]
> - ret
> -
> -L(move_long):
> - /* Only use backward copy if there is an overlap. */
> - sub tmp1, dstin, src
> - cbz tmp1, L(move0)
> - cmp tmp1, count
> - b.hs L(copy_long)
> -
> - /* Large backwards copy for overlapping copies.
> - Copy 16 bytes and then align srcend to 16-byte alignment. */
> -L(copy_long_backwards):
> - ldr D_q, [srcend, -16]
> - and tmp1, srcend, 15
> - bic srcend, srcend, 15
> - sub count, count, tmp1
> - ldp A_q, B_q, [srcend, -32]
> - str D_q, [dstend, -16]
> - ldp C_q, D_q, [srcend, -64]
> - sub dstend, dstend, tmp1
> - subs count, count, 128
> - b.ls L(copy64_from_start)
> -
> -L(loop64_backwards):
> - str B_q, [dstend, -16]
> - str A_q, [dstend, -32]
> - ldp A_q, B_q, [srcend, -96]
> - str D_q, [dstend, -48]
> - str C_q, [dstend, -64]!
> - ldp C_q, D_q, [srcend, -128]
> - sub srcend, srcend, 64
> - subs count, count, 64
> - b.hi L(loop64_backwards)
> -
> - /* Write the last iteration and copy 64 bytes from the start. */
> -L(copy64_from_start):
> - ldp E_q, F_q, [src, 32]
> - stp A_q, B_q, [dstend, -32]
> - ldp A_q, B_q, [src]
> - stp C_q, D_q, [dstend, -64]
> - stp E_q, F_q, [dstin, 32]
> - stp A_q, B_q, [dstin]
> -L(move0):
> - ret
> -
> -END (__memmove_simd)
> -libc_hidden_builtin_def (__memmove_simd)
> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> index 7dae8b7c956f9083d0896cc771cae79f4901581d..dbf1536525e614f72d3d74bb193015b303618357 100644
> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -29,7 +29,6 @@
> extern __typeof (__redirect_memmove) __libc_memmove;
>
> extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
> -extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
> @@ -40,9 +39,6 @@ select_memmove_ifunc (void)
> {
> INIT_ARCH ();
>
> - if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
> - return __memmove_simd;
> -
> if (sve && HAVE_AARCH64_SVE_ASM)
> {
> if (IS_A64FX (midr))
>
next prev parent reply other threads:[~2022-10-12 19:13 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-10-12 15:19 Wilco Dijkstra
2022-10-12 19:12 ` Andrew Pinski [this message]
2022-10-13 12:28 ` Wilco Dijkstra
2022-10-13 12:30 ` Adhemerval Zanella Netto
2022-10-19 12:31 ` Wilco Dijkstra
2022-10-25 12:55 ` Szabolcs Nagy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CA+=Sn1mf4o2XNOdyUtsMV_P98P0PXampaNXw8ikjf_LYASwH7A@mail.gmail.com' \
--to=pinskia@gmail.com \
--cc=Wilco.Dijkstra@arm.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).