From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
To: Sebastian Huber <sebastian.huber@embedded-brains.de>,
newlib@sourceware.org
Cc: Szabolcs Nagy <szabolcs.nagy@arm.com>
Subject: Re: [PATCH v3 1/2] aarch64: Sync with ARM-software/optimized-routines
Date: Thu, 5 Oct 2023 11:37:06 +0100 [thread overview]
Message-ID: <eca2ad5d-d833-235c-cad2-a1cd76dab88d@foss.arm.com> (raw)
In-Reply-To: <20230912100507.33946-2-sebastian.huber@embedded-brains.de>
Hi Sebastian,
My apologies for the delay in replying; organizing the GNU Cauldron took up
a lot of my time over the last few weeks.
This is basically ok, but you're removing an existing license and adding
a new one from Arm. I think you need to copy the new license into
COPYING.NEWLIB: an SPDX identifier alone is not enough; the text of the
license must be added somewhere as well.
R.
On 12/09/2023 11:05, Sebastian Huber wrote:
> Update AArch64 assembly string routines from:
>
> https://github.com/ARM-software/optimized-routines
>
> commit 0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560
> Author: Sebastian Huber <sebastian.huber@embedded-brains.de>
> Date: Thu Jul 27 17:14:57 2023 +0200
>
> string: Fix corrupt GNU_PROPERTY_TYPE (5) size
>
> For ELF32 the notes alignment is 4 and not 8.
> ---
> newlib/libc/machine/aarch64/asmdefs.h | 106 ++++++
> newlib/libc/machine/aarch64/memchr.S | 73 ++--
> newlib/libc/machine/aarch64/memcmp.S | 311 +++++++++--------
> newlib/libc/machine/aarch64/memcpy.S | 272 ++++++++-------
> newlib/libc/machine/aarch64/memset.S | 194 ++---------
> newlib/libc/machine/aarch64/stpcpy.S | 36 +-
> newlib/libc/machine/aarch64/strchr.S | 107 ++----
> newlib/libc/machine/aarch64/strchrnul.S | 90 ++---
> newlib/libc/machine/aarch64/strcmp.S | 282 ++++++++-------
> newlib/libc/machine/aarch64/strcpy.S | 437 +++++++-----------------
> newlib/libc/machine/aarch64/strlen.S | 319 ++++++++---------
> newlib/libc/machine/aarch64/strncmp.S | 323 ++++++++++--------
> newlib/libc/machine/aarch64/strnlen.S | 256 +++++---------
> newlib/libc/machine/aarch64/strrchr.S | 86 ++---
> 14 files changed, 1226 insertions(+), 1666 deletions(-)
> create mode 100644 newlib/libc/machine/aarch64/asmdefs.h
>
> diff --git a/newlib/libc/machine/aarch64/asmdefs.h b/newlib/libc/machine/aarch64/asmdefs.h
> new file mode 100644
> index 0000000000..131b95e1fe
> --- /dev/null
> +++ b/newlib/libc/machine/aarch64/asmdefs.h
> @@ -0,0 +1,106 @@
> +/*
> + * Macros for asm code. AArch64 version.
> + *
> + * Copyright (c) 2019-2023, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> +
> +#ifndef _ASMDEFS_H
> +#define _ASMDEFS_H
> +
> +/* Branch Target Identification support. */
> +#define BTI_C hint 34
> +#define BTI_J hint 36
> +/* Return address signing support (pac-ret). */
> +#define PACIASP hint 25; .cfi_window_save
> +#define AUTIASP hint 29; .cfi_window_save
> +
> +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
> +#define FEATURE_1_AND 0xc0000000
> +#define FEATURE_1_BTI 1
> +#define FEATURE_1_PAC 2
> +
> +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
> +#ifdef __ILP32__
> +#define GNU_PROPERTY(type, value) \
> + .section .note.gnu.property, "a"; \
> + .p2align 2; \
> + .word 4; \
> + .word 12; \
> + .word 5; \
> + .asciz "GNU"; \
> + .word type; \
> + .word 4; \
> + .word value; \
> + .text
> +#else
> +#define GNU_PROPERTY(type, value) \
> + .section .note.gnu.property, "a"; \
> + .p2align 3; \
> + .word 4; \
> + .word 16; \
> + .word 5; \
> + .asciz "GNU"; \
> + .word type; \
> + .word 4; \
> + .word value; \
> + .word 0; \
> + .text
> +#endif
> +
> +/* If set, a GNU property note section is added to
> + mark objects as supporting BTI and PAC-RET. */
> +#ifndef WANT_GNU_PROPERTY
> +#define WANT_GNU_PROPERTY 1
> +#endif
> +
> +#if WANT_GNU_PROPERTY
> +/* Add property note with supported features to all asm files. */
> +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
> +#endif
> +
> +#define ENTRY_ALIGN(name, alignment) \
> + .global name; \
> + .type name,%function; \
> + .align alignment; \
> + name: \
> + .cfi_startproc; \
> + BTI_C;
> +
> +#define ENTRY(name) ENTRY_ALIGN(name, 6)
> +
> +#define ENTRY_ALIAS(name) \
> + .global name; \
> + .type name,%function; \
> + name:
> +
> +#define END(name) \
> + .cfi_endproc; \
> + .size name, .-name;
> +
> +#define L(l) .L ## l
> +
> +#ifdef __ILP32__
> + /* Sanitize padding bits of pointer arguments as per aapcs64 */
> +#define PTR_ARG(n) mov w##n, w##n
> +#else
> +#define PTR_ARG(n)
> +#endif
> +
> +#ifdef __ILP32__
> + /* Sanitize padding bits of size arguments as per aapcs64 */
> +#define SIZE_ARG(n) mov w##n, w##n
> +#else
> +#define SIZE_ARG(n)
> +#endif
> +
> +/* Compiler supports SVE instructions */
> +#ifndef HAVE_SVE
> +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
> +# define HAVE_SVE 1
> +# else
> +# define HAVE_SVE 0
> +# endif
> +#endif
> +
> +#endif
> diff --git a/newlib/libc/machine/aarch64/memchr.S b/newlib/libc/machine/aarch64/memchr.S
> index 53f5d6bc0e..a0f305e0fc 100644
> --- a/newlib/libc/machine/aarch64/memchr.S
> +++ b/newlib/libc/machine/aarch64/memchr.S
> @@ -1,31 +1,8 @@
> /*
> * memchr - find a character in a memory zone
> *
> - * Copyright (c) 2014, ARM Limited
> - * All rights Reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions are met:
> - * * Redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer.
> - * * Redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution.
> - * * Neither the name of the company nor the names of its contributors
> - * may be used to endorse or promote products derived from this
> - * software without specific prior written permission.
> - *
> - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + * Copyright (c) 2014-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> */
>
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> @@ -37,6 +14,8 @@
> * Neon Available.
> */
>
> +#include "asmdefs.h"
> +
> /* Arguments and results. */
> #define srcin x0
> #define chrin w1
> @@ -70,17 +49,11 @@
> * identify exactly which byte has matched.
> */
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn memchr
> +ENTRY (memchr)
> + PTR_ARG (0)
> + SIZE_ARG (2)
> /* Do not dereference srcin if no bytes to compare. */
> - cbz cntin, .Lzero_length
> + cbz cntin, L(zero_length)
> /*
> * Magic constant 0x40100401 allows us to identify which lane matches
> * the requested byte.
> @@ -93,7 +66,7 @@ def_fn memchr
> dup vrepmask.4s, wtmp2
> ands soff, srcin, #31
> and cntrem, cntin, #31
> - b.eq .Lloop
> + b.eq L(loop)
>
> /*
> * Input string is not 32-byte aligned. We calculate the syndrome
> @@ -110,41 +83,41 @@ def_fn memchr
> and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
> addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
> addp vend.16b, vend.16b, vend.16b /* 128->64 */
> - mov synd, vend.2d[0]
> + mov synd, vend.d[0]
> /* Clear the soff*2 lower bits */
> lsl tmp, soff, #1
> lsr synd, synd, tmp
> lsl synd, synd, tmp
> /* The first block can also be the last */
> - b.ls .Lmasklast
> + b.ls L(masklast)
> /* Have we found something already? */
> - cbnz synd, .Ltail
> + cbnz synd, L(tail)
>
> -.Lloop:
> +L(loop):
> ld1 {vdata1.16b, vdata2.16b}, [src], #32
> subs cntin, cntin, #32
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> /* If we're out of data we finish regardless of the result */
> - b.ls .Lend
> + b.ls L(end)
> /* Use a fast check for the termination condition */
> orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
> addp vend.2d, vend.2d, vend.2d
> - mov synd, vend.2d[0]
> + mov synd, vend.d[0]
> /* We're not out of data, loop if we haven't found the character */
> - cbz synd, .Lloop
> + cbz synd, L(loop)
>
> -.Lend:
> +L(end):
> /* Termination condition found, let's calculate the syndrome value */
> and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
> and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
> addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
> addp vend.16b, vend.16b, vend.16b /* 128->64 */
> - mov synd, vend.2d[0]
> + mov synd, vend.d[0]
> /* Only do the clear for the last possible block */
> - b.hi .Ltail
> + b.hs L(tail)
>
> -.Lmasklast:
> +L(masklast):
> /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
> add tmp, cntrem, soff
> and tmp, tmp, #31
> @@ -153,7 +126,7 @@ def_fn memchr
> lsl synd, synd, tmp
> lsr synd, synd, tmp
>
> -.Ltail:
> +L(tail):
> /* Count the trailing zeros using bit reversing */
> rbit synd, synd
> /* Compensate the last post-increment */
> @@ -168,9 +141,9 @@ def_fn memchr
> csel result, xzr, result, eq
> ret
>
> -.Lzero_length:
> +L(zero_length):
> mov result, #0
> ret
>
> - .size memchr, . - memchr
> +END (memchr)
> #endif
> diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S
> index 605d99365e..18874d3215 100644
> --- a/newlib/libc/machine/aarch64/memcmp.S
> +++ b/newlib/libc/machine/aarch64/memcmp.S
> @@ -1,57 +1,7 @@
> /* memcmp - compare memory
> -
> - Copyright (c) 2018 Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> -/*
> - * Copyright (c) 2017 ARM Ltd
> - * All rights reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions
> - * are met:
> - * 1. Redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer.
> - * 2. Redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution.
> - * 3. The name of the company may not be used to endorse or promote
> - * products derived from this software without specific prior written
> - * permission.
> *
> - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
> - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
> - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
> - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
> - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + * Copyright (c) 2013-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> */
>
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> @@ -60,103 +10,79 @@
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64, unaligned accesses.
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> */
>
> -#define L(l) .L ## l
> -
> -/* Parameters and result. */
> -#define src1 x0
> -#define src2 x1
> -#define limit x2
> -#define result w0
> -
> -/* Internal variables. */
> -#define data1 x3
> -#define data1w w3
> -#define data1h x4
> -#define data2 x5
> -#define data2w w5
> -#define data2h x6
> -#define tmp1 x7
> -#define tmp2 x8
> -
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn memcmp p2align=6
> - subs limit, limit, 8
> - b.lo L(less8)
> -
> - ldr data1, [src1], 8
> - ldr data2, [src2], 8
> - cmp data1, data2
> - b.ne L(return)
> -
> - subs limit, limit, 8
> - b.gt L(more16)
> -
> - ldr data1, [src1, limit]
> - ldr data2, [src2, limit]
> - b L(return)
> -
> -L(more16):
> - ldr data1, [src1], 8
> - ldr data2, [src2], 8
> - cmp data1, data2
> - bne L(return)
> -
> - /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
> - strings. */
> - subs limit, limit, 16
> +#include "asmdefs.h"
> +
> +#define src1 x0
> +#define src2 x1
> +#define limit x2
> +#define result w0
> +
> +#define data1 x3
> +#define data1w w3
> +#define data2 x4
> +#define data2w w4
> +#define data3 x5
> +#define data3w w5
> +#define data4 x6
> +#define data4w w6
> +#define tmp x6
> +#define src1end x7
> +#define src2end x8
> +
> +
> +ENTRY (memcmp)
> + PTR_ARG (0)
> + PTR_ARG (1)
> + SIZE_ARG (2)
> +
> + cmp limit, 16
> + b.lo L(less16)
> + ldp data1, data3, [src1]
> + ldp data2, data4, [src2]
> + ccmp data1, data2, 0, ne
> + ccmp data3, data4, 0, eq
> + b.ne L(return2)
> +
> + add src1end, src1, limit
> + add src2end, src2, limit
> + cmp limit, 32
> b.ls L(last_bytes)
> + cmp limit, 160
> + b.hs L(loop_align)
> + sub limit, limit, 32
>
> - /* We overlap loads between 0-32 bytes at either side of SRC1 when we
> - try to align, so limit it only to strings larger than 128 bytes. */
> - cmp limit, 96
> - b.ls L(loop16)
> -
> - /* Align src1 and adjust src2 with bytes not yet done. */
> - and tmp1, src1, 15
> - add limit, limit, tmp1
> - sub src1, src1, tmp1
> - sub src2, src2, tmp1
> -
> - /* Loop performing 16 bytes per iteration using aligned src1.
> - Limit is pre-decremented by 16 and must be larger than zero.
> - Exit if <= 16 bytes left to do or if the data is not equal. */
> .p2align 4
> -L(loop16):
> - ldp data1, data1h, [src1], 16
> - ldp data2, data2h, [src2], 16
> - subs limit, limit, 16
> - ccmp data1, data2, 0, hi
> - ccmp data1h, data2h, 0, eq
> - b.eq L(loop16)
> -
> +L(loop32):
> + ldp data1, data3, [src1, 16]
> + ldp data2, data4, [src2, 16]
> cmp data1, data2
> - bne L(return)
> - mov data1, data1h
> - mov data2, data2h
> + ccmp data3, data4, 0, eq
> + b.ne L(return2)
> + cmp limit, 16
> + b.ls L(last_bytes)
> +
> + ldp data1, data3, [src1, 32]
> + ldp data2, data4, [src2, 32]
> cmp data1, data2
> - bne L(return)
> + ccmp data3, data4, 0, eq
> + b.ne L(return2)
> + add src1, src1, 32
> + add src2, src2, 32
> +L(last64):
> + subs limit, limit, 32
> + b.hi L(loop32)
>
> /* Compare last 1-16 bytes using unaligned access. */
> L(last_bytes):
> - add src1, src1, limit
> - add src2, src2, limit
> - ldp data1, data1h, [src1]
> - ldp data2, data2h, [src2]
> - cmp data1, data2
> - bne L(return)
> - mov data1, data1h
> - mov data2, data2h
> + ldp data1, data3, [src1end, -16]
> + ldp data2, data4, [src2end, -16]
> +L(return2):
> cmp data1, data2
> + csel data1, data1, data3, ne
> + csel data2, data2, data4, ne
>
> /* Compare data bytes and set return value to 0, -1 or 1. */
> L(return):
> @@ -164,33 +90,106 @@ L(return):
> rev data1, data1
> rev data2, data2
> #endif
> - cmp data1, data2
> -L(ret_eq):
> + cmp data1, data2
> cset result, ne
> cneg result, result, lo
> ret
>
> .p2align 4
> - /* Compare up to 8 bytes. Limit is [-8..-1]. */
> +L(less16):
> + add src1end, src1, limit
> + add src2end, src2, limit
> + tbz limit, 3, L(less8)
> + ldr data1, [src1]
> + ldr data2, [src2]
> + ldr data3, [src1end, -8]
> + ldr data4, [src2end, -8]
> + b L(return2)
> +
> + .p2align 4
> L(less8):
> - adds limit, limit, 4
> - b.lo L(less4)
> - ldr data1w, [src1], 4
> - ldr data2w, [src2], 4
> + tbz limit, 2, L(less4)
> + ldr data1w, [src1]
> + ldr data2w, [src2]
> + ldr data3w, [src1end, -4]
> + ldr data4w, [src2end, -4]
> + b L(return2)
> +
> +L(less4):
> + tbz limit, 1, L(less2)
> + ldrh data1w, [src1]
> + ldrh data2w, [src2]
> cmp data1w, data2w
> b.ne L(return)
> - sub limit, limit, 4
> -L(less4):
> - adds limit, limit, 4
> - beq L(ret_eq)
> -L(byte_loop):
> - ldrb data1w, [src1], 1
> - ldrb data2w, [src2], 1
> - subs limit, limit, 1
> - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
> - b.eq L(byte_loop)
> +L(less2):
> + mov result, 0
> + tbz limit, 0, L(return_zero)
> + ldrb data1w, [src1end, -1]
> + ldrb data2w, [src2end, -1]
> sub result, data1w, data2w
> +L(return_zero):
> + ret
> +
> +L(loop_align):
> + ldp data1, data3, [src1, 16]
> + ldp data2, data4, [src2, 16]
> + cmp data1, data2
> + ccmp data3, data4, 0, eq
> + b.ne L(return2)
> +
> + /* Align src2 and adjust src1, src2 and limit. */
> + and tmp, src2, 15
> + sub tmp, tmp, 16
> + sub src2, src2, tmp
> + add limit, limit, tmp
> + sub src1, src1, tmp
> + sub limit, limit, 64 + 16
> +
> + .p2align 4
> +L(loop64):
> + ldr q0, [src1, 16]
> + ldr q1, [src2, 16]
> + subs limit, limit, 64
> + ldr q2, [src1, 32]
> + ldr q3, [src2, 32]
> + eor v0.16b, v0.16b, v1.16b
> + eor v1.16b, v2.16b, v3.16b
> + ldr q2, [src1, 48]
> + ldr q3, [src2, 48]
> + umaxp v0.16b, v0.16b, v1.16b
> + ldr q4, [src1, 64]!
> + ldr q5, [src2, 64]!
> + eor v1.16b, v2.16b, v3.16b
> + eor v2.16b, v4.16b, v5.16b
> + umaxp v1.16b, v1.16b, v2.16b
> + umaxp v0.16b, v0.16b, v1.16b
> + umaxp v0.16b, v0.16b, v0.16b
> + fmov tmp, d0
> + ccmp tmp, 0, 0, hi
> + b.eq L(loop64)
> +
> + /* If equal, process last 1-64 bytes using scalar loop. */
> + add limit, limit, 64 + 16
> + cbz tmp, L(last64)
> +
> + /* Determine the 8-byte aligned offset of the first difference. */
> +#ifdef __AARCH64EB__
> + rev16 tmp, tmp
> +#endif
> + rev tmp, tmp
> + clz tmp, tmp
> + bic tmp, tmp, 7
> + sub tmp, tmp, 48
> + ldr data1, [src1, tmp]
> + ldr data2, [src2, tmp]
> +#ifndef __AARCH64EB__
> + rev data1, data1
> + rev data2, data2
> +#endif
> + mov result, 1
> + cmp data1, data2
> + cneg result, result, lo
> ret
>
> - .size memcmp, . - memcmp
> +END (memcmp)
> #endif
> diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S
> index 463bad0a18..248e7843a2 100644
> --- a/newlib/libc/machine/aarch64/memcpy.S
> +++ b/newlib/libc/machine/aarch64/memcpy.S
> @@ -1,55 +1,8 @@
> -/* Copyright (c) 2012-2013, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> /*
> - * Copyright (c) 2015 ARM Ltd
> - * All rights reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions
> - * are met:
> - * 1. Redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer.
> - * 2. Redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution.
> - * 3. The name of the company may not be used to endorse or promote
> - * products derived from this software without specific prior written
> - * permission.
> + * memcpy - copy memory area
> *
> - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
> - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
> - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
> - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
> - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + * Copyright (c) 2012-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> */
>
> /* Assumptions:
> @@ -61,6 +14,7 @@
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See memcpy-stub.c */
> #else
> +#include "asmdefs.h"
>
> #define dstin x0
> #define src x1
> @@ -71,122 +25,139 @@
> #define A_l x6
> #define A_lw w6
> #define A_h x7
> -#define A_hw w7
> #define B_l x8
> #define B_lw w8
> #define B_h x9
> #define C_l x10
> +#define C_lw w10
> #define C_h x11
> #define D_l x12
> #define D_h x13
> -#define E_l src
> -#define E_h count
> -#define F_l srcend
> -#define F_h dst
> -#define tmp1 x9
> -
> -#define L(l) .L ## l
> -
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -/* Copies are split into 3 main cases: small copies of up to 16 bytes,
> - medium copies of 17..96 bytes which are fully unrolled. Large copies
> - of more than 96 bytes align the destination and use an unrolled loop
> - processing 64 bytes per iteration.
> - Small and medium copies read all data before writing, allowing any
> - kind of overlap, and memmove tailcalls memcpy for these cases as
> - well as non-overlapping copies.
> +#define E_l x14
> +#define E_h x15
> +#define F_l x16
> +#define F_h x17
> +#define G_l count
> +#define G_h dst
> +#define H_l src
> +#define H_h srcend
> +#define tmp1 x14
> +
> +/* This implementation handles overlaps and supports both memcpy and memmove
> + from a single entry point. It uses unaligned accesses and branchless
> + sequences to keep the code small, simple and improve performance.
> +
> + Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> + copies of up to 128 bytes, and large copies. The overhead of the overlap
> + check is negligible since it is only required for large copies.
> +
> + Large copies use a software pipelined loop processing 64 bytes per iteration.
> + The destination pointer is 16-byte aligned to minimize unaligned accesses.
> + The loop tail is handled by always copying 64 bytes from the end.
> */
>
> -def_fn memcpy p2align=6
> - prfm PLDL1KEEP, [src]
> +ENTRY_ALIAS (memmove)
> +ENTRY (memcpy)
> + PTR_ARG (0)
> + PTR_ARG (1)
> + SIZE_ARG (2)
> add srcend, src, count
> add dstend, dstin, count
> - cmp count, 16
> - b.ls L(copy16)
> - cmp count, 96
> + cmp count, 128
> b.hi L(copy_long)
> + cmp count, 32
> + b.hi L(copy32_128)
>
> - /* Medium copies: 17..96 bytes. */
> - sub tmp1, count, 1
> + /* Small copies: 0..32 bytes. */
> + cmp count, 16
> + b.lo L(copy16)
> ldp A_l, A_h, [src]
> - tbnz tmp1, 6, L(copy96)
> ldp D_l, D_h, [srcend, -16]
> - tbz tmp1, 5, 1f
> - ldp B_l, B_h, [src, 16]
> - ldp C_l, C_h, [srcend, -32]
> - stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstend, -32]
> -1:
> stp A_l, A_h, [dstin]
> stp D_l, D_h, [dstend, -16]
> ret
>
> - .p2align 4
> - /* Small copies: 0..16 bytes. */
> + /* Copy 8-15 bytes. */
> L(copy16):
> - cmp count, 8
> - b.lo 1f
> + tbz count, 3, L(copy8)
> ldr A_l, [src]
> ldr A_h, [srcend, -8]
> str A_l, [dstin]
> str A_h, [dstend, -8]
> ret
> - .p2align 4
> -1:
> - tbz count, 2, 1f
> +
> + .p2align 3
> + /* Copy 4-7 bytes. */
> +L(copy8):
> + tbz count, 2, L(copy4)
> ldr A_lw, [src]
> - ldr A_hw, [srcend, -4]
> + ldr B_lw, [srcend, -4]
> str A_lw, [dstin]
> - str A_hw, [dstend, -4]
> + str B_lw, [dstend, -4]
> ret
>
> - /* Copy 0..3 bytes. Use a branchless sequence that copies the same
> - byte 3 times if count==1, or the 2nd byte twice if count==2. */
> -1:
> - cbz count, 2f
> + /* Copy 0..3 bytes using a branchless sequence. */
> +L(copy4):
> + cbz count, L(copy0)
> lsr tmp1, count, 1
> ldrb A_lw, [src]
> - ldrb A_hw, [srcend, -1]
> + ldrb C_lw, [srcend, -1]
> ldrb B_lw, [src, tmp1]
> strb A_lw, [dstin]
> strb B_lw, [dstin, tmp1]
> - strb A_hw, [dstend, -1]
> -2: ret
> + strb C_lw, [dstend, -1]
> +L(copy0):
> + ret
>
> .p2align 4
> - /* Copy 64..96 bytes. Copy 64 bytes from the start and
> - 32 bytes from the end. */
> -L(copy96):
> + /* Medium copies: 33..128 bytes. */
> +L(copy32_128):
> + ldp A_l, A_h, [src]
> ldp B_l, B_h, [src, 16]
> - ldp C_l, C_h, [src, 32]
> - ldp D_l, D_h, [src, 48]
> - ldp E_l, E_h, [srcend, -32]
> - ldp F_l, F_h, [srcend, -16]
> + ldp C_l, C_h, [srcend, -32]
> + ldp D_l, D_h, [srcend, -16]
> + cmp count, 64
> + b.hi L(copy128)
> stp A_l, A_h, [dstin]
> stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstin, 32]
> - stp D_l, D_h, [dstin, 48]
> - stp E_l, E_h, [dstend, -32]
> - stp F_l, F_h, [dstend, -16]
> + stp C_l, C_h, [dstend, -32]
> + stp D_l, D_h, [dstend, -16]
> ret
>
> - /* Align DST to 16 byte alignment so that we don't cross cache line
> - boundaries on both loads and stores. There are at least 96 bytes
> - to copy, so copy 16 bytes unaligned and then align. The loop
> - copies 64 bytes per iteration and prefetches one iteration ahead. */
> + .p2align 4
> + /* Copy 65..128 bytes. */
> +L(copy128):
> + ldp E_l, E_h, [src, 32]
> + ldp F_l, F_h, [src, 48]
> + cmp count, 96
> + b.ls L(copy96)
> + ldp G_l, G_h, [srcend, -64]
> + ldp H_l, H_h, [srcend, -48]
> + stp G_l, G_h, [dstend, -64]
> + stp H_l, H_h, [dstend, -48]
> +L(copy96):
> + stp A_l, A_h, [dstin]
> + stp B_l, B_h, [dstin, 16]
> + stp E_l, E_h, [dstin, 32]
> + stp F_l, F_h, [dstin, 48]
> + stp C_l, C_h, [dstend, -32]
> + stp D_l, D_h, [dstend, -16]
> + ret
>
> .p2align 4
> + /* Copy more than 128 bytes. */
> L(copy_long):
> + /* Use backwards copy if there is an overlap. */
> + sub tmp1, dstin, src
> + cbz tmp1, L(copy0)
> + cmp tmp1, count
> + b.lo L(copy_long_backwards)
> +
> + /* Copy 16 bytes and then align dst to 16-byte alignment. */
> +
> + ldp D_l, D_h, [src]
> and tmp1, dstin, 15
> bic dst, dstin, 15
> - ldp D_l, D_h, [src]
> sub src, src, tmp1
> add count, count, tmp1 /* Count is now 16 too large. */
> ldp A_l, A_h, [src, 16]
> @@ -195,8 +166,9 @@ L(copy_long):
> ldp C_l, C_h, [src, 48]
> ldp D_l, D_h, [src, 64]!
> subs count, count, 128 + 16 /* Test and readjust count. */
> - b.ls 2f
> -1:
> + b.ls L(copy64_from_end)
> +
> +L(loop64):
> stp A_l, A_h, [dst, 16]
> ldp A_l, A_h, [src, 16]
> stp B_l, B_h, [dst, 32]
> @@ -206,12 +178,10 @@ L(copy_long):
> stp D_l, D_h, [dst, 64]!
> ldp D_l, D_h, [src, 64]!
> subs count, count, 64
> - b.hi 1b
> + b.hi L(loop64)
>
> - /* Write the last full set of 64 bytes. The remainder is at most 64
> - bytes, so it is safe to always copy 64 bytes from the end even if
> - there is just 1 byte left. */
> -2:
> + /* Write the last iteration and copy 64 bytes from the end. */
> +L(copy64_from_end):
> ldp E_l, E_h, [srcend, -64]
> stp A_l, A_h, [dst, 16]
> ldp A_l, A_h, [srcend, -48]
> @@ -226,5 +196,51 @@ L(copy_long):
> stp C_l, C_h, [dstend, -16]
> ret
>
> - .size memcpy, . - memcpy
> + .p2align 4
> +
> + /* Large backwards copy for overlapping copies.
> + Copy 16 bytes and then align dst to 16-byte alignment. */
> +L(copy_long_backwards):
> + ldp D_l, D_h, [srcend, -16]
> + and tmp1, dstend, 15
> + sub srcend, srcend, tmp1
> + sub count, count, tmp1
> + ldp A_l, A_h, [srcend, -16]
> + stp D_l, D_h, [dstend, -16]
> + ldp B_l, B_h, [srcend, -32]
> + ldp C_l, C_h, [srcend, -48]
> + ldp D_l, D_h, [srcend, -64]!
> + sub dstend, dstend, tmp1
> + subs count, count, 128
> + b.ls L(copy64_from_start)
> +
> +L(loop64_backwards):
> + stp A_l, A_h, [dstend, -16]
> + ldp A_l, A_h, [srcend, -16]
> + stp B_l, B_h, [dstend, -32]
> + ldp B_l, B_h, [srcend, -32]
> + stp C_l, C_h, [dstend, -48]
> + ldp C_l, C_h, [srcend, -48]
> + stp D_l, D_h, [dstend, -64]!
> + ldp D_l, D_h, [srcend, -64]!
> + subs count, count, 64
> + b.hi L(loop64_backwards)
> +
> + /* Write the last iteration and copy 64 bytes from the start. */
> +L(copy64_from_start):
> + ldp G_l, G_h, [src, 48]
> + stp A_l, A_h, [dstend, -16]
> + ldp A_l, A_h, [src, 32]
> + stp B_l, B_h, [dstend, -32]
> + ldp B_l, B_h, [src, 16]
> + stp C_l, C_h, [dstend, -48]
> + ldp C_l, C_h, [src]
> + stp D_l, D_h, [dstend, -64]
> + stp G_l, G_h, [dstin, 48]
> + stp A_l, A_h, [dstin, 32]
> + stp B_l, B_h, [dstin, 16]
> + stp C_l, C_h, [dstin]
> + ret
> +
> +END (memcpy)
> #endif
> diff --git a/newlib/libc/machine/aarch64/memset.S b/newlib/libc/machine/aarch64/memset.S
> index 103e3f8bb0..ca76439a91 100644
> --- a/newlib/libc/machine/aarch64/memset.S
> +++ b/newlib/libc/machine/aarch64/memset.S
> @@ -1,66 +1,20 @@
> -/* Copyright (c) 2012-2013, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> /*
> - * Copyright (c) 2015 ARM Ltd
> - * All rights reserved.
> + * memset - fill memory with a constant byte
> *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions
> - * are met:
> - * 1. Redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer.
> - * 2. Redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution.
> - * 3. The name of the company may not be used to endorse or promote
> - * products derived from this software without specific prior written
> - * permission.
> - *
> - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
> - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
> - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
> - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
> - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + * Copyright (c) 2012-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> */
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64, unaligned accesses
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> *
> */
>
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See memset-stub.c */
> #else
> +#include "asmdefs.h"
>
> #define dstin x0
> #define val x1
> @@ -68,24 +22,11 @@
> #define count x2
> #define dst x3
> #define dstend x4
> -#define tmp1 x5
> -#define tmp1w w5
> -#define tmp2 x6
> -#define tmp2w w6
> -#define zva_len x7
> -#define zva_lenw w7
> -
> -#define L(l) .L ## l
> +#define zva_val x5
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn memset p2align=6
> +ENTRY (memset)
> + PTR_ARG (0)
> + SIZE_ARG (2)
>
> dup v0.16B, valw
> add dstend, dstin, count
> @@ -101,7 +42,7 @@ def_fn memset p2align=6
> str val, [dstin]
> str val, [dstend, -8]
> ret
> - nop
> + .p2align 4
> 1: tbz count, 2, 2f
> str valw, [dstin]
> str valw, [dstend, -4]
> @@ -131,110 +72,49 @@ L(set96):
> stp q0, q0, [dstend, -32]
> ret
>
> - .p2align 3
> - nop
> + .p2align 4
> L(set_long):
> and valw, valw, 255
> bic dst, dstin, 15
> str q0, [dstin]
> - cmp count, 256
> - ccmp valw, 0, 0, cs
> - b.eq L(try_zva)
> -L(no_zva):
> - sub count, dstend, dst /* Count is 16 too large. */
> - sub dst, dst, 16 /* Dst is biased by -32. */
> - sub count, count, 64 + 16 /* Adjust count and bias for loop. */
> -1: stp q0, q0, [dst, 32]
> - stp q0, q0, [dst, 64]!
> -L(tail64):
> - subs count, count, 64
> - b.hi 1b
> -2: stp q0, q0, [dstend, -64]
> - stp q0, q0, [dstend, -32]
> - ret
> -
> - .p2align 3
> -L(try_zva):
> - mrs tmp1, dczid_el0
> - tbnz tmp1w, 4, L(no_zva)
> - and tmp1w, tmp1w, 15
> - cmp tmp1w, 4 /* ZVA size is 64 bytes. */
> - b.ne L(zva_128)
> -
> - /* Write the first and last 64 byte aligned block using stp rather
> - than using DC ZVA. This is faster on some cores.
> - */
> -L(zva_64):
> + cmp count, 160
> + ccmp valw, 0, 0, hs
> + b.ne L(no_zva)
> +
> +#ifndef SKIP_ZVA_CHECK
> + mrs zva_val, dczid_el0
> + and zva_val, zva_val, 31
> + cmp zva_val, 4 /* ZVA size is 64 bytes. */
> + b.ne L(no_zva)
> +#endif
> str q0, [dst, 16]
> stp q0, q0, [dst, 32]
> bic dst, dst, 63
> - stp q0, q0, [dst, 64]
> - stp q0, q0, [dst, 96]
> - sub count, dstend, dst /* Count is now 128 too large. */
> - sub count, count, 128+64+64 /* Adjust count and bias for loop. */
> - add dst, dst, 128
> - nop
> -1: dc zva, dst
> + sub count, dstend, dst /* Count is now 64 too large. */
> + sub count, count, 128 /* Adjust count and bias for loop. */
> +
> + .p2align 4
> +L(zva_loop):
> add dst, dst, 64
> + dc zva, dst
> subs count, count, 64
> - b.hi 1b
> - stp q0, q0, [dst, 0]
> - stp q0, q0, [dst, 32]
> + b.hi L(zva_loop)
> stp q0, q0, [dstend, -64]
> stp q0, q0, [dstend, -32]
> ret
>
> - .p2align 3
> -L(zva_128):
> - cmp tmp1w, 5 /* ZVA size is 128 bytes. */
> - b.ne L(zva_other)
> -
> - str q0, [dst, 16]
> +L(no_zva):
> + sub count, dstend, dst /* Count is 16 too large. */
> + sub dst, dst, 16 /* Dst is biased by -32. */
> + sub count, count, 64 + 16 /* Adjust count and bias for loop. */
> +L(no_zva_loop):
> stp q0, q0, [dst, 32]
> - stp q0, q0, [dst, 64]
> - stp q0, q0, [dst, 96]
> - bic dst, dst, 127
> - sub count, dstend, dst /* Count is now 128 too large. */
> - sub count, count, 128+128 /* Adjust count and bias for loop. */
> - add dst, dst, 128
> -1: dc zva, dst
> - add dst, dst, 128
> - subs count, count, 128
> - b.hi 1b
> - stp q0, q0, [dstend, -128]
> - stp q0, q0, [dstend, -96]
> + stp q0, q0, [dst, 64]!
> + subs count, count, 64
> + b.hi L(no_zva_loop)
> stp q0, q0, [dstend, -64]
> stp q0, q0, [dstend, -32]
> ret
>
> -L(zva_other):
> - mov tmp2w, 4
> - lsl zva_lenw, tmp2w, tmp1w
> - add tmp1, zva_len, 64 /* Max alignment bytes written. */
> - cmp count, tmp1
> - blo L(no_zva)
> -
> - sub tmp2, zva_len, 1
> - add tmp1, dst, zva_len
> - add dst, dst, 16
> - subs count, tmp1, dst /* Actual alignment bytes to write. */
> - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
> - beq 2f
> -1: stp q0, q0, [dst], 64
> - stp q0, q0, [dst, -32]
> - subs count, count, 64
> - b.hi 1b
> -2: mov dst, tmp1
> - sub count, dstend, tmp1 /* Remaining bytes to write. */
> - subs count, count, zva_len
> - b.lo 4f
> -3: dc zva, dst
> - add dst, dst, zva_len
> - subs count, count, zva_len
> - b.hs 3b
> -4: add count, count, zva_len
> - sub dst, dst, 32 /* Bias dst for tail loop. */
> - b L(tail64)
> -
> - .size memset, . - memset
> +END (memset)
> #endif
> diff --git a/newlib/libc/machine/aarch64/stpcpy.S b/newlib/libc/machine/aarch64/stpcpy.S
> index 696b45889f..155c68d75a 100644
> --- a/newlib/libc/machine/aarch64/stpcpy.S
> +++ b/newlib/libc/machine/aarch64/stpcpy.S
> @@ -1,34 +1,10 @@
> /*
> - stpcpy - copy a string returning pointer to end.
> + * stpcpy - copy a string returning pointer to end.
> + *
> + * Copyright (c) 2020, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
>
> - Copyright (c) 2015 ARM Ltd.
> - All Rights Reserved.
> +#define BUILD_STPCPY 1
>
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the company nor the names of its contributors
> - may be used to endorse or promote products derived from this
> - software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> -/* This is just a wrapper that uses strcpy code with appropriate
> - pre-defines. */
> -
> -#define BUILD_STPCPY
> #include "strcpy.S"
> diff --git a/newlib/libc/machine/aarch64/strchr.S b/newlib/libc/machine/aarch64/strchr.S
> index 2448dbc7d5..500d9aff29 100644
> --- a/newlib/libc/machine/aarch64/strchr.S
> +++ b/newlib/libc/machine/aarch64/strchr.S
> @@ -1,32 +1,9 @@
> /*
> - strchr - find a character in a string
> -
> - Copyright (c) 2014, ARM Limited
> - All rights Reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the company nor the names of its contributors
> - may be used to endorse or promote products derived from this
> - software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> + * strchr - find a character in a string
> + *
> + * Copyright (c) 2014-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strchr-stub.c */
> #else
> @@ -37,6 +14,8 @@
> * Neon Available.
> */
>
> +#include "asmdefs.h"
> +
> /* Arguments and results. */
> #define srcin x0
> #define chrin w1
> @@ -74,26 +53,19 @@
>
> /* Locals and temporaries. */
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn strchr
> - /* Magic constant 0x40100401 to allow us to identify which lane
> - matches the requested byte. Magic constant 0x80200802 used
> - similarly for NUL termination. */
> - mov wtmp2, #0x0401
> - movk wtmp2, #0x4010, lsl #16
> +ENTRY (strchr)
> + PTR_ARG (0)
> + /* Magic constant 0xc0300c03 to allow us to identify which lane
> + matches the requested byte. Even bits are set if the character
> + matches, odd bits if either the char is NUL or matches. */
> + mov wtmp2, 0x0c03
> + movk wtmp2, 0xc030, lsl 16
> dup vrepchr.16b, chrin
> bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
> dup vrepmask_c.4s, wtmp2
> ands tmp1, srcin, #31
> add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
> - b.eq .Lloop
> + b.eq L(loop)
>
> /* Input string is not 32-byte aligned. Rather than forcing
> the padding bytes to a safe value, we calculate the syndrome
> @@ -105,49 +77,42 @@ def_fn strchr
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> cmeq vhas_nul2.16b, vdata2.16b, #0
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
> - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
> - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
> - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
> - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
> - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
> + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
> + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
> + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
> + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
> lsl tmp1, tmp1, #1
> addp vend1.16b, vend1.16b, vend2.16b // 256->128
> mov tmp3, #~0
> addp vend1.16b, vend1.16b, vend2.16b // 128->64
> lsr tmp1, tmp3, tmp1
>
> - mov tmp3, vend1.2d[0]
> + mov tmp3, vend1.d[0]
> bic tmp1, tmp3, tmp1 // Mask padding bits.
> - cbnz tmp1, .Ltail
> + cbnz tmp1, L(tail)
>
> -.Lloop:
> + .p2align 4
> +L(loop):
> ld1 {vdata1.16b, vdata2.16b}, [src], #32
> - cmeq vhas_nul1.16b, vdata1.16b, #0
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> - cmeq vhas_nul2.16b, vdata2.16b, #0
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> - /* Use a fast check for the termination condition. */
> - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
> - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
> - orr vend1.16b, vend1.16b, vend2.16b
> - addp vend1.2d, vend1.2d, vend1.2d
> - mov tmp1, vend1.2d[0]
> - cbz tmp1, .Lloop
> + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
> + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
> + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
> + umaxp vend1.16b, vend1.16b, vend1.16b
> + mov tmp1, vend1.d[0]
> + cbz tmp1, L(loop)
>
> /* Termination condition found. Now need to establish exactly why
> we terminated. */
> - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
> - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
> - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
> - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
> - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
> - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
> + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
> + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
> + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
> + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
> addp vend1.16b, vend1.16b, vend2.16b // 256->128
> addp vend1.16b, vend1.16b, vend2.16b // 128->64
> -
> - mov tmp1, vend1.2d[0]
> -.Ltail:
> + mov tmp1, vend1.d[0]
> +L(tail):
> /* Count the trailing zeros, by bit reversing... */
> rbit tmp1, tmp1
> /* Re-bias source. */
> @@ -160,5 +125,5 @@ def_fn strchr
> csel result, result, xzr, eq
> ret
>
> - .size strchr, . - strchr
> +END (strchr)
> #endif
> diff --git a/newlib/libc/machine/aarch64/strchrnul.S b/newlib/libc/machine/aarch64/strchrnul.S
> index a0ac13b7f4..ceaf4dca17 100644
> --- a/newlib/libc/machine/aarch64/strchrnul.S
> +++ b/newlib/libc/machine/aarch64/strchrnul.S
> @@ -1,32 +1,9 @@
> /*
> - strchrnul - find a character or nul in a string
> -
> - Copyright (c) 2014, ARM Limited
> - All rights Reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the company nor the names of its contributors
> - may be used to endorse or promote products derived from this
> - software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> + * strchrnul - find a character or nul in a string
> + *
> + * Copyright (c) 2014-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strchrnul-stub.c */
> #else
> @@ -37,6 +14,8 @@
> * Neon Available.
> */
>
> +#include "asmdefs.h"
> +
> /* Arguments and results. */
> #define srcin x0
> #define chrin w1
> @@ -70,15 +49,8 @@
>
> /* Locals and temporaries. */
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn strchrnul
> +ENTRY (strchrnul)
> + PTR_ARG (0)
> /* Magic constant 0x40100401 to allow us to identify which lane
> matches the termination condition. */
> mov wtmp2, #0x0401
> @@ -87,7 +59,7 @@ def_fn strchrnul
> bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
> dup vrepmask.4s, wtmp2
> ands tmp1, srcin, #31
> - b.eq .Lloop
> + b.eq L(loop)
>
> /* Input string is not 32-byte aligned. Rather than forcing
> the padding bytes to a safe value, we calculate the syndrome
> @@ -95,47 +67,43 @@ def_fn strchrnul
> syndrome that are related to the padding. */
> ld1 {vdata1.16b, vdata2.16b}, [src], #32
> neg tmp1, tmp1
> - cmeq vhas_nul1.16b, vdata1.16b, #0
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> - cmeq vhas_nul2.16b, vdata2.16b, #0
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> - orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
> - orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
> - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
> - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
> + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
> + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
> + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
> + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
> lsl tmp1, tmp1, #1
> addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
> mov tmp3, #~0
> addp vend1.16b, vend1.16b, vend1.16b // 128->64
> lsr tmp1, tmp3, tmp1
>
> - mov tmp3, vend1.2d[0]
> + mov tmp3, vend1.d[0]
> bic tmp1, tmp3, tmp1 // Mask padding bits.
> - cbnz tmp1, .Ltail
> + cbnz tmp1, L(tail)
>
> -.Lloop:
> + .p2align 4
> +L(loop):
> ld1 {vdata1.16b, vdata2.16b}, [src], #32
> - cmeq vhas_nul1.16b, vdata1.16b, #0
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> - cmeq vhas_nul2.16b, vdata2.16b, #0
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> - /* Use a fast check for the termination condition. */
> - orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
> - orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
> - orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
> - addp vend1.2d, vend1.2d, vend1.2d
> - mov tmp1, vend1.2d[0]
> - cbz tmp1, .Lloop
> + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
> + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
> + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
> + umaxp vend1.16b, vend1.16b, vend1.16b
> + mov tmp1, vend1.d[0]
> + cbz tmp1, L(loop)
>
> /* Termination condition found. Now need to establish exactly why
> we terminated. */
> - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
> - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
> + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
> + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
> addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
> addp vend1.16b, vend1.16b, vend1.16b // 128->64
>
> - mov tmp1, vend1.2d[0]
> -.Ltail:
> + mov tmp1, vend1.d[0]
> +L(tail):
> /* Count the trailing zeros, by bit reversing... */
> rbit tmp1, tmp1
> /* Re-bias source. */
> @@ -145,5 +113,5 @@ def_fn strchrnul
> add result, src, tmp1, lsr #1
> ret
>
> - .size strchrnul, . - strchrnul
> +END (strchrnul)
> #endif
> diff --git a/newlib/libc/machine/aarch64/strcmp.S b/newlib/libc/machine/aarch64/strcmp.S
> index e2bef2d49d..691a1760ee 100644
> --- a/newlib/libc/machine/aarch64/strcmp.S
> +++ b/newlib/libc/machine/aarch64/strcmp.S
> @@ -1,202 +1,192 @@
> -/* Copyright (c) 2012-2018, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> -/* Assumptions:
> +/*
> + * strcmp - compare two strings
> *
> - * ARMv8-a, AArch64
> + * Copyright (c) 2012-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> */
>
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strcmp-stub.c */
> #else
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64.
> + * MTE compatible.
> + */
>
> -#define L(label) .L ## label
> +#include "asmdefs.h"
>
> #define REP8_01 0x0101010101010101
> #define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
>
> -/* Parameters and result. */
> #define src1 x0
> #define src2 x1
> #define result x0
>
> -/* Internal variables. */
> #define data1 x2
> #define data1w w2
> #define data2 x3
> #define data2w w3
> #define has_nul x4
> #define diff x5
> +#define off1 x5
> #define syndrome x6
> -#define tmp1 x7
> -#define tmp2 x8
> -#define tmp3 x9
> -#define zeroones x10
> -#define pos x11
> -
> - /* Start of performance-critical section -- one 64B cache line. */
> -def_fn strcmp p2align=6
> - eor tmp1, src1, src2
> - mov zeroones, #REP8_01
> - tst tmp1, #7
> +#define tmp x6
> +#define data3 x7
> +#define zeroones x8
> +#define shift x9
> +#define off2 x10
> +
> +/* Early bytes are at the MSB on big-endian and at the LSB on little-endian.
> + LS_FW means shifting towards early bytes. */
> +#ifdef __AARCH64EB__
> +# define LS_FW lsl
> +#else
> +# define LS_FW lsr
> +#endif
> +
> +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> + can be done in parallel across the entire word.
> + Since carry propagation makes 0x1 bytes before a NUL byte appear
> + NUL too in big-endian, byte-reverse the data before the NUL check. */
> +
> +
> +ENTRY (strcmp)
> + PTR_ARG (0)
> + PTR_ARG (1)
> + sub off2, src2, src1
> + mov zeroones, REP8_01
> + and tmp, src1, 7
> + tst off2, 7
> b.ne L(misaligned8)
> - ands tmp1, src1, #7
> - b.ne L(mutual_align)
> - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> - can be done in parallel across the entire word. */
> + cbnz tmp, L(mutual_align)
> +
> + .p2align 4
> +
> L(loop_aligned):
> - ldr data1, [src1], #8
> - ldr data2, [src2], #8
> + ldr data2, [src1, off2]
> + ldr data1, [src1], 8
> L(start_realigned):
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - eor diff, data1, data2 /* Non-zero if differences found. */
> - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> +#ifdef __AARCH64EB__
> + rev tmp, data1
> + sub has_nul, tmp, zeroones
> + orr tmp, tmp, REP8_7f
> +#else
> + sub has_nul, data1, zeroones
> + orr tmp, data1, REP8_7f
> +#endif
> + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
> + ccmp data1, data2, 0, eq
> + b.eq L(loop_aligned)
> +#ifdef __AARCH64EB__
> + rev has_nul, has_nul
> +#endif
> + eor diff, data1, data2
> orr syndrome, diff, has_nul
> - cbz syndrome, L(loop_aligned)
> - /* End of performance-critical section -- one 64B cache line. */
> -
> L(end):
> -#ifndef __AARCH64EB__
> +#ifndef __AARCH64EB__
> rev syndrome, syndrome
> rev data1, data1
> - /* The MS-non-zero bit of the syndrome marks either the first bit
> - that is different, or the top bit of the first zero byte.
> - Shifting left now will bring the critical information into the
> - top bits. */
> - clz pos, syndrome
> rev data2, data2
> - lsl data1, data1, pos
> - lsl data2, data2, pos
> - /* But we need to zero-extend (char is unsigned) the value and then
> - perform a signed 32-bit subtraction. */
> - lsr data1, data1, #56
> - sub result, data1, data2, lsr #56
> - ret
> -#else
> - /* For big-endian we cannot use the trick with the syndrome value
> - as carry-propagation can corrupt the upper bits if the trailing
> - bytes in the string contain 0x01. */
> - /* However, if there is no NUL byte in the dword, we can generate
> - the result directly. We can't just subtract the bytes as the
> - MSB might be significant. */
> - cbnz has_nul, 1f
> - cmp data1, data2
> - cset result, ne
> - cneg result, result, lo
> - ret
> -1:
> - /* Re-compute the NUL-byte detection, using a byte-reversed value. */
> - rev tmp3, data1
> - sub tmp1, tmp3, zeroones
> - orr tmp2, tmp3, #REP8_7f
> - bic has_nul, tmp1, tmp2
> - rev has_nul, has_nul
> - orr syndrome, diff, has_nul
> - clz pos, syndrome
> - /* The MS-non-zero bit of the syndrome marks either the first bit
> - that is different, or the top bit of the first zero byte.
> +#endif
> + clz shift, syndrome
> + /* The most-significant-non-zero bit of the syndrome marks either the
> + first bit that is different, or the top bit of the first zero byte.
> Shifting left now will bring the critical information into the
> top bits. */
> - lsl data1, data1, pos
> - lsl data2, data2, pos
> + lsl data1, data1, shift
> + lsl data2, data2, shift
> /* But we need to zero-extend (char is unsigned) the value and then
> perform a signed 32-bit subtraction. */
> - lsr data1, data1, #56
> - sub result, data1, data2, lsr #56
> + lsr data1, data1, 56
> + sub result, data1, data2, lsr 56
> ret
> -#endif
> +
> + .p2align 4
>
> L(mutual_align):
> /* Sources are mutually aligned, but are not currently at an
> alignment boundary. Round down the addresses and then mask off
> - the bytes that preceed the start point. */
> - bic src1, src1, #7
> - bic src2, src2, #7
> - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
> - ldr data1, [src1], #8
> - neg tmp1, tmp1 /* Bits to alignment -64. */
> - ldr data2, [src2], #8
> - mov tmp2, #~0
> -#ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> -#else
> - /* Little-endian. Early bytes are at LSB. */
> - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> -#endif
> - orr data1, data1, tmp2
> - orr data2, data2, tmp2
> + the bytes that precede the start point. */
> + bic src1, src1, 7
> + ldr data2, [src1, off2]
> + ldr data1, [src1], 8
> + neg shift, src2, lsl 3 /* Bits to alignment -64. */
> + mov tmp, -1
> + LS_FW tmp, tmp, shift
> + orr data1, data1, tmp
> + orr data2, data2, tmp
> b L(start_realigned)
>
> L(misaligned8):
> /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
> - checking to make sure that we don't access beyond page boundary in
> - SRC2. */
> - tst src1, #7
> - b.eq L(loop_misaligned)
> + checking to make sure that we don't access beyond the end of SRC2. */
> + cbz tmp, L(src1_aligned)
> L(do_misaligned):
> - ldrb data1w, [src1], #1
> - ldrb data2w, [src2], #1
> - cmp data1w, #1
> - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
> + ldrb data1w, [src1], 1
> + ldrb data2w, [src2], 1
> + cmp data1w, 0
> + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
> b.ne L(done)
> - tst src1, #7
> + tst src1, 7
> b.ne L(do_misaligned)
>
> -L(loop_misaligned):
> - /* Test if we are within the last dword of the end of a 4K page. If
> - yes then jump back to the misaligned loop to copy a byte at a time. */
> - and tmp1, src2, #0xff8
> - eor tmp1, tmp1, #0xff8
> - cbz tmp1, L(do_misaligned)
> - ldr data1, [src1], #8
> - ldr data2, [src2], #8
> -
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - eor diff, data1, data2 /* Non-zero if differences found. */
> - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> +L(src1_aligned):
> + neg shift, src2, lsl 3
> + bic src2, src2, 7
> + ldr data3, [src2], 8
> +#ifdef __AARCH64EB__
> + rev data3, data3
> +#endif
> + lsr tmp, zeroones, shift
> + orr data3, data3, tmp
> + sub has_nul, data3, zeroones
> + orr tmp, data3, REP8_7f
> + bics has_nul, has_nul, tmp
> + b.ne L(tail)
> +
> + sub off1, src2, src1
> +
> + .p2align 4
> +
> +L(loop_unaligned):
> + ldr data3, [src1, off1]
> + ldr data2, [src1, off2]
> +#ifdef __AARCH64EB__
> + rev data3, data3
> +#endif
> + sub has_nul, data3, zeroones
> + orr tmp, data3, REP8_7f
> + ldr data1, [src1], 8
> + bics has_nul, has_nul, tmp
> + ccmp data1, data2, 0, eq
> + b.eq L(loop_unaligned)
> +
> + lsl tmp, has_nul, shift
> +#ifdef __AARCH64EB__
> + rev tmp, tmp
> +#endif
> + eor diff, data1, data2
> + orr syndrome, diff, tmp
> + cbnz syndrome, L(end)
> +L(tail):
> + ldr data1, [src1]
> + neg shift, shift
> + lsr data2, data3, shift
> + lsr has_nul, has_nul, shift
> +#ifdef __AARCH64EB__
> + rev data2, data2
> + rev has_nul, has_nul
> +#endif
> + eor diff, data1, data2
> orr syndrome, diff, has_nul
> - cbz syndrome, L(loop_misaligned)
> b L(end)
>
> L(done):
> sub result, data1, data2
> ret
> - .size strcmp, .-strcmp
>
> +END (strcmp)
> #endif
> diff --git a/newlib/libc/machine/aarch64/strcpy.S b/newlib/libc/machine/aarch64/strcpy.S
> index e5405f2535..57c46f3908 100644
> --- a/newlib/libc/machine/aarch64/strcpy.S
> +++ b/newlib/libc/machine/aarch64/strcpy.S
> @@ -1,341 +1,160 @@
> /*
> - strcpy/stpcpy - copy a string returning pointer to start/end.
> -
> - Copyright (c) 2013, 2014, 2015 ARM Ltd.
> - All Rights Reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the company nor the names of its contributors
> - may be used to endorse or promote products derived from this
> - software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> + * strcpy/stpcpy - copy a string returning pointer to start/end.
> + *
> + * Copyright (c) 2020-2023, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strchr-stub.c */
> #else
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> + * ARMv8-a, AArch64, Advanced SIMD.
> + * MTE compatible.
> */
>
> -/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
> +#include "asmdefs.h"
>
> - To test the page crossing code path more thoroughly, compile with
> - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
> - entry path. This option is not intended for production use. */
> -
> -/* Arguments and results. */
> #define dstin x0
> #define srcin x1
> +#define result x0
>
> -/* Locals and temporaries. */
> #define src x2
> #define dst x3
> -#define data1 x4
> -#define data1w w4
> -#define data2 x5
> -#define data2w w5
> -#define has_nul1 x6
> -#define has_nul2 x7
> -#define tmp1 x8
> -#define tmp2 x9
> -#define tmp3 x10
> -#define tmp4 x11
> -#define zeroones x12
> -#define data1a x13
> -#define data2a x14
> -#define pos x15
> -#define len x16
> -#define to_align x17
> +#define len x4
> +#define synd x4
> +#define tmp x5
> +#define shift x5
> +#define data1 x6
> +#define dataw1 w6
> +#define data2 x7
> +#define dataw2 w7
> +
> +#define dataq q0
> +#define vdata v0
> +#define vhas_nul v1
> +#define vend v2
> +#define dend d2
> +#define dataq2 q1
>
> #ifdef BUILD_STPCPY
> -#define STRCPY stpcpy
> +# define STRCPY stpcpy
> +# define IFSTPCPY(X,...) X,__VA_ARGS__
> #else
> -#define STRCPY strcpy
> +# define STRCPY strcpy
> +# define IFSTPCPY(X,...)
> #endif
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> - can be done in parallel across the entire word. */
> -
> -#define REP8_01 0x0101010101010101
> -#define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
> -
> - /* AArch64 systems have a minimum page size of 4k. We can do a quick
> - page size check for crossing this boundary on entry and if we
> - do not, then we can short-circuit much of the entry code. We
> - expect early page-crossing strings to be rare (probability of
> - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
> - predictable, even with random strings.
> -
> - We don't bother checking for larger page sizes, the cost of setting
> - up the correct page size is just not worth the extra gain from
> - a small reduction in the cases taking the slow path. Note that
> - we only care about whether the first fetch, which may be
> - misaligned, crosses a page boundary - after that we move to aligned
> - fetches for the remainder of the string. */
> -
> -#ifdef STRCPY_TEST_PAGE_CROSS
> - /* Make everything that isn't Qword aligned look like a page cross. */
> -#define MIN_PAGE_P2 4
> -#else
> -#define MIN_PAGE_P2 12
> -#endif
> -
> -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
> -
> -def_fn STRCPY p2align=6
> - /* For moderately short strings, the fastest way to do the copy is to
> - calculate the length of the string in the same way as strlen, then
> - essentially do a memcpy of the result. This avoids the need for
> - multiple byte copies and further means that by the time we
> - reach the bulk copy loop we know we can always use DWord
> - accesses. We expect strcpy to rarely be called repeatedly
> - with the same source string, so branch prediction is likely to
> - always be difficult - we mitigate against this by preferring
> - conditional select operations over branches whenever this is
> - feasible. */
> - and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
> - mov zeroones, #REP8_01
> - and to_align, srcin, #15
> - cmp tmp2, #(MIN_PAGE_SIZE - 16)
> - neg tmp1, to_align
> - /* The first fetch will straddle a (possible) page boundary iff
> - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
> - aligned string will never fail the page align check, so will
> - always take the fast path. */
> - b.gt .Lpage_cross
> -
> -.Lpage_cross_ok:
> - ldp data1, data2, [srcin]
> -#ifdef __AARCH64EB__
> - /* Because we expect the end to be found within 16 characters
> - (profiling shows this is the most common case), it's worth
> - swapping the bytes now to save having to recalculate the
> - termination syndrome later. We preserve data1 and data2
> - so that we can re-use the values later on. */
> - rev tmp2, data1
> - sub tmp1, tmp2, zeroones
> - orr tmp2, tmp2, #REP8_7f
> - bics has_nul1, tmp1, tmp2
> - b.ne .Lfp_le8
> - rev tmp4, data2
> - sub tmp3, tmp4, zeroones
> - orr tmp4, tmp4, #REP8_7f
> -#else
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - bics has_nul1, tmp1, tmp2
> - b.ne .Lfp_le8
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> +/*
> + Core algorithm:
> + For each 16-byte chunk we calculate a 64-bit nibble mask with four bits per
> + byte, taking 4 bits of every comparison byte with a shift-right-and-narrow-by-4
> + (SHRN) instruction. Since the nibble mask preserves the order of the bytes in
> + the original string, counting leading zeros (after a bit reversal where
> + needed) and dividing by four identifies exactly which byte matched. */
> +
> +ENTRY (STRCPY)
> + PTR_ARG (0)
> + PTR_ARG (1)
> + bic src, srcin, 15
> + ld1 {vdata.16b}, [src]
> + cmeq vhas_nul.16b, vdata.16b, 0
> + lsl shift, srcin, 2
> + shrn vend.8b, vhas_nul.8h, 4
> + fmov synd, dend
> + lsr synd, synd, shift
> + cbnz synd, L(tail)
> +
> + ldr dataq, [src, 16]!
> + cmeq vhas_nul.16b, vdata.16b, 0
> + shrn vend.8b, vhas_nul.8h, 4
> + fmov synd, dend
> + cbz synd, L(start_loop)
> +
> +#ifndef __AARCH64EB__
> + rbit synd, synd
> #endif
> - bics has_nul2, tmp3, tmp4
> - b.eq .Lbulk_entry
> + sub tmp, src, srcin
> + clz len, synd
> + add len, tmp, len, lsr 2
> + tbz len, 4, L(less16)
> + sub tmp, len, 15
> + ldr dataq, [srcin]
> + ldr dataq2, [srcin, tmp]
> + str dataq, [dstin]
> + str dataq2, [dstin, tmp]
> + IFSTPCPY (add result, dstin, len)
> + ret
>
> - /* The string is short (<=16 bytes). We don't know exactly how
> - short though, yet. Work out the exact length so that we can
> - quickly select the optimal copy strategy. */
> -.Lfp_gt8:
> - rev has_nul2, has_nul2
> - clz pos, has_nul2
> - mov tmp2, #56
> - add dst, dstin, pos, lsr #3 /* Bits to bytes. */
> - sub pos, tmp2, pos
> -#ifdef __AARCH64EB__
> - lsr data2, data2, pos
> -#else
> - lsl data2, data2, pos
> -#endif
> - str data2, [dst, #1]
> +L(tail):
> + rbit synd, synd
> + clz len, synd
> + lsr len, len, 2
> +L(less16):
> + tbz len, 3, L(less8)
> + sub tmp, len, 7
> + ldr data1, [srcin]
> + ldr data2, [srcin, tmp]
> str data1, [dstin]
> -#ifdef BUILD_STPCPY
> - add dstin, dst, #8
> -#endif
> + str data2, [dstin, tmp]
> + IFSTPCPY (add result, dstin, len)
> ret
>
> -.Lfp_le8:
> - rev has_nul1, has_nul1
> - clz pos, has_nul1
> - add dst, dstin, pos, lsr #3 /* Bits to bytes. */
> - subs tmp2, pos, #24 /* Pos in bits. */
> - b.lt .Lfp_lt4
> -#ifdef __AARCH64EB__
> - mov tmp2, #56
> - sub pos, tmp2, pos
> - lsr data2, data1, pos
> - lsr data1, data1, #32
> -#else
> - lsr data2, data1, tmp2
> -#endif
> - /* 4->7 bytes to copy. */
> - str data2w, [dst, #-3]
> - str data1w, [dstin]
> -#ifdef BUILD_STPCPY
> - mov dstin, dst
> -#endif
> - ret
> -.Lfp_lt4:
> - cbz pos, .Lfp_lt2
> - /* 2->3 bytes to copy. */
> -#ifdef __AARCH64EB__
> - lsr data1, data1, #48
> -#endif
> - strh data1w, [dstin]
> - /* Fall-through, one byte (max) to go. */
> -.Lfp_lt2:
> - /* Null-terminated string. Last character must be zero! */
> - strb wzr, [dst]
> -#ifdef BUILD_STPCPY
> - mov dstin, dst
> -#endif
> + .p2align 4
> +L(less8):
> + subs tmp, len, 3
> + b.lo L(less4)
> + ldr dataw1, [srcin]
> + ldr dataw2, [srcin, tmp]
> + str dataw1, [dstin]
> + str dataw2, [dstin, tmp]
> + IFSTPCPY (add result, dstin, len)
> ret
>
> - .p2align 6
> - /* Aligning here ensures that the entry code and main loop all lies
> - within one 64-byte cache line. */
> -.Lbulk_entry:
> - sub to_align, to_align, #16
> - stp data1, data2, [dstin]
> - sub src, srcin, to_align
> - sub dst, dstin, to_align
> - b .Lentry_no_page_cross
> -
> - /* The inner loop deals with two Dwords at a time. This has a
> - slightly higher start-up cost, but we should win quite quickly,
> - especially on cores with a high number of issue slots per
> - cycle, as we get much better parallelism out of the operations. */
> -.Lmain_loop:
> - stp data1, data2, [dst], #16
> -.Lentry_no_page_cross:
> - ldp data1, data2, [src], #16
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> - bic has_nul1, tmp1, tmp2
> - bics has_nul2, tmp3, tmp4
> - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
> - b.eq .Lmain_loop
> -
> - /* Since we know we are copying at least 16 bytes, the fastest way
> - to deal with the tail is to determine the location of the
> - trailing NUL, then (re)copy the 16 bytes leading up to that. */
> - cmp has_nul1, #0
> -#ifdef __AARCH64EB__
> - /* For big-endian, carry propagation (if the final byte in the
> - string is 0x01) means we cannot use has_nul directly. The
> - easiest way to get the correct byte is to byte-swap the data
> - and calculate the syndrome a second time. */
> - csel data1, data1, data2, ne
> - rev data1, data1
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - bic has_nul1, tmp1, tmp2
> -#else
> - csel has_nul1, has_nul1, has_nul2, ne
> -#endif
> - rev has_nul1, has_nul1
> - clz pos, has_nul1
> - add tmp1, pos, #72
> - add pos, pos, #8
> - csel pos, pos, tmp1, ne
> - add src, src, pos, lsr #3
> - add dst, dst, pos, lsr #3
> - ldp data1, data2, [src, #-32]
> - stp data1, data2, [dst, #-16]
> -#ifdef BUILD_STPCPY
> - sub dstin, dst, #1
> -#endif
> +L(less4):
> + cbz len, L(zerobyte)
> + ldrh dataw1, [srcin]
> + strh dataw1, [dstin]
> +L(zerobyte):
> + strb wzr, [dstin, len]
> + IFSTPCPY (add result, dstin, len)
> ret
>
> -.Lpage_cross:
> - bic src, srcin, #15
> - /* Start by loading two words at [srcin & ~15], then forcing the
> - bytes that precede srcin to 0xff. This means they never look
> - like termination bytes. */
> - ldp data1, data2, [src]
> - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
> - tst to_align, #7
> - csetm tmp2, ne
> -#ifdef __AARCH64EB__
> - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> -#else
> - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> + .p2align 4
> +L(start_loop):
> + sub tmp, srcin, dstin
> + ldr dataq2, [srcin]
> + sub dst, src, tmp
> + str dataq2, [dstin]
> +L(loop):
> + str dataq, [dst], 32
> + ldr dataq, [src, 16]
> + cmeq vhas_nul.16b, vdata.16b, 0
> + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
> + fmov synd, dend
> + cbnz synd, L(loopend)
> + str dataq, [dst, -16]
> + ldr dataq, [src, 32]!
> + cmeq vhas_nul.16b, vdata.16b, 0
> + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
> + fmov synd, dend
> + cbz synd, L(loop)
> + add dst, dst, 16
> +L(loopend):
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> + fmov synd, dend
> + sub dst, dst, 31
> +#ifndef __AARCH64EB__
> + rbit synd, synd
> #endif
> - orr data1, data1, tmp2
> - orr data2a, data2, tmp2
> - cmp to_align, #8
> - csinv data1, data1, xzr, lt
> - csel data2, data2, data2a, lt
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> - bic has_nul1, tmp1, tmp2
> - bics has_nul2, tmp3, tmp4
> - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
> - b.eq .Lpage_cross_ok
> - /* We now need to make data1 and data2 look like they've been
> - loaded directly from srcin. Do a rotate on the 128-bit value. */
> - lsl tmp1, to_align, #3 /* Bytes->bits. */
> - neg tmp2, to_align, lsl #3
> -#ifdef __AARCH64EB__
> - lsl data1a, data1, tmp1
> - lsr tmp4, data2, tmp2
> - lsl data2, data2, tmp1
> - orr tmp4, tmp4, data1a
> - cmp to_align, #8
> - csel data1, tmp4, data2, lt
> - rev tmp2, data1
> - rev tmp4, data2
> - sub tmp1, tmp2, zeroones
> - orr tmp2, tmp2, #REP8_7f
> - sub tmp3, tmp4, zeroones
> - orr tmp4, tmp4, #REP8_7f
> -#else
> - lsr data1a, data1, tmp1
> - lsl tmp4, data2, tmp2
> - lsr data2, data2, tmp1
> - orr tmp4, tmp4, data1a
> - cmp to_align, #8
> - csel data1, tmp4, data2, lt
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> -#endif
> - bic has_nul1, tmp1, tmp2
> - cbnz has_nul1, .Lfp_le8
> - bic has_nul2, tmp3, tmp4
> - b .Lfp_gt8
> + clz len, synd
> + lsr len, len, 2
> + add dst, dst, len
> + ldr dataq, [dst, tmp]
> + str dataq, [dst]
> + IFSTPCPY (add result, dst, 15)
> + ret
>
> - .size STRCPY, . - STRCPY
> +END (STRCPY)
> #endif
> diff --git a/newlib/libc/machine/aarch64/strlen.S b/newlib/libc/machine/aarch64/strlen.S
> index 872d136ef4..68a6f357cf 100644
> --- a/newlib/libc/machine/aarch64/strlen.S
> +++ b/newlib/libc/machine/aarch64/strlen.S
> @@ -1,115 +1,92 @@
> -/* Copyright (c) 2013-2015, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> +/*
> + * strlen - calculate the length of a string.
> + *
> + * Copyright (c) 2020-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strlen-stub.c */
> #else
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> + * Not MTE compatible.
> */
>
> -/* To test the page crossing code path more thoroughly, compile with
> - -DTEST_PAGE_CROSS - this will force all calls through the slower
> - entry path. This option is not intended for production use. */
> -
> -/* Arguments and results. */
> -#define srcin x0
> -#define len x0
> -
> -/* Locals and temporaries. */
> -#define src x1
> -#define data1 x2
> -#define data2 x3
> -#define has_nul1 x4
> -#define has_nul2 x5
> -#define tmp1 x4
> -#define tmp2 x5
> -#define tmp3 x6
> -#define tmp4 x7
> -#define zeroones x8
> -
> -#define L(l) .L ## l
> -
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> - can be done in parallel across the entire word. A faster check
> - (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
> - false hits for characters 129..255. */
> +#include "asmdefs.h"
> +
> +#define srcin x0
> +#define len x0
> +
> +#define src x1
> +#define data1 x2
> +#define data2 x3
> +#define has_nul1 x4
> +#define has_nul2 x5
> +#define tmp1 x4
> +#define tmp2 x5
> +#define tmp3 x6
> +#define tmp4 x7
> +#define zeroones x8
> +
> +#define maskv v0
> +#define maskd d0
> +#define dataq1 q1
> +#define dataq2 q2
> +#define datav1 v1
> +#define datav2 v2
> +#define tmp x2
> +#define tmpw w2
> +#define synd x3
> +#define syndw w3
> +#define shift x4
> +
> +/* For the first 32 bytes, NUL detection works on the principle that
> + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
> + byte is zero, and can be done in parallel across the entire word. */
>
> #define REP8_01 0x0101010101010101
> #define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
> +
> +/* To test the page crossing code path more thoroughly, compile with
> + -DTEST_PAGE_CROSS - this will force all calls through the slower
> + entry path. This option is not intended for production use. */
>
> #ifdef TEST_PAGE_CROSS
> -# define MIN_PAGE_SIZE 15
> +# define MIN_PAGE_SIZE 32
> #else
> # define MIN_PAGE_SIZE 4096
> #endif
>
> - /* Since strings are short on average, we check the first 16 bytes
> - of the string for a NUL character. In order to do an unaligned ldp
> - safely we have to do a page cross check first. If there is a NUL
> - byte we calculate the length from the 2 8-byte words using
> - conditional select to reduce branch mispredictions (it is unlikely
> - strlen will be repeatedly called on strings with the same length).
> -
> - If the string is longer than 16 bytes, we align src so don't need
> - further page cross checks, and process 32 bytes per iteration
> - using the fast NUL check. If we encounter non-ASCII characters,
> - fallback to a second loop using the full NUL check.
> -
> - If the page cross check fails, we read 16 bytes from an aligned
> - address, remove any characters before the string, and continue
> - in the main loop using aligned loads. Since strings crossing a
> - page in the first 16 bytes are rare (probability of
> - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> -
> - AArch64 systems have a minimum page size of 4k. We don't bother
> - checking for larger page sizes - the cost of setting up the correct
> - page size is just not worth the extra gain from a small reduction in
> - the cases taking the slow path. Note that we only care about
> - whether the first fetch, which may be misaligned, crosses a page
> - boundary. */
> -
> -def_fn strlen p2align=6
> +/* Core algorithm:
> +
> + Since strings are short on average, we check the first 32 bytes of the
> + string for a NUL character without aligning the string. In order to use
> + unaligned loads safely we must do a page cross check first.
> +
> + If there is a NUL byte we calculate the length from the 2 8-byte words
> + using conditional select to reduce branch mispredictions (it is unlikely
> + strlen will be repeatedly called on strings with the same length).
> +
> + If the string is longer than 32 bytes, align src so we don't need further
> + page cross checks, and process 32 bytes per iteration using a fast SIMD
> + loop.
> +
> + If the page cross check fails, we read 32 bytes from an aligned address,
> + and ignore any characters before the string. If it contains a NUL
> + character, return the length; if not, continue in the main loop. */
> +
> +ENTRY (strlen)
> + PTR_ARG (0)
> and tmp1, srcin, MIN_PAGE_SIZE - 1
> - mov zeroones, REP8_01
> - cmp tmp1, MIN_PAGE_SIZE - 16
> - b.gt L(page_cross)
> + cmp tmp1, MIN_PAGE_SIZE - 32
> + b.hi L(page_cross)
> +
> + /* Look for a NUL byte in the first 16 bytes. */
> ldp data1, data2, [srcin]
> + mov zeroones, REP8_01
> +
> #ifdef __AARCH64EB__
> /* For big-endian, carry propagation (if the final byte in the
> string is 0x01) means we cannot use has_nul1/2 directly.
> @@ -125,114 +102,96 @@ def_fn strlen p2align=6
> bics has_nul1, tmp1, tmp2
> bic has_nul2, tmp3, tmp4
> ccmp has_nul2, 0, 0, eq
> - beq L(main_loop_entry)
> + b.eq L(bytes16_31)
>
> - /* Enter with C = has_nul1 == 0. */
> + /* Find the exact offset of the first NUL byte in the first 16 bytes
> + from the string start. Enter with C = has_nul1 == 0. */
> csel has_nul1, has_nul1, has_nul2, cc
> mov len, 8
> rev has_nul1, has_nul1
> - clz tmp1, has_nul1
> csel len, xzr, len, cc
> + clz tmp1, has_nul1
> add len, len, tmp1, lsr 3
> ret
>
> - /* The inner loop processes 32 bytes per iteration and uses the fast
> - NUL check. If we encounter non-ASCII characters, use a second
> - loop with the accurate NUL check. */
> - .p2align 4
> -L(main_loop_entry):
> - bic src, srcin, 15
> - sub src, src, 16
> -L(main_loop):
> - ldp data1, data2, [src, 32]!
> -.Lpage_cross_entry:
> - sub tmp1, data1, zeroones
> - sub tmp3, data2, zeroones
> - orr tmp2, tmp1, tmp3
> - tst tmp2, zeroones, lsl 7
> - bne 1f
> - ldp data1, data2, [src, 16]
> + /* Look for a NUL byte at offset 16..31 in the string. */
> +L(bytes16_31):
> + ldp data1, data2, [srcin, 16]
> +#ifdef __AARCH64EB__
> + rev data1, data1
> + rev data2, data2
> +#endif
> sub tmp1, data1, zeroones
> - sub tmp3, data2, zeroones
> - orr tmp2, tmp1, tmp3
> - tst tmp2, zeroones, lsl 7
> - beq L(main_loop)
> - add src, src, 16
> -1:
> - /* The fast check failed, so do the slower, accurate NUL check. */
> orr tmp2, data1, REP8_7f
> + sub tmp3, data2, zeroones
> orr tmp4, data2, REP8_7f
> bics has_nul1, tmp1, tmp2
> bic has_nul2, tmp3, tmp4
> ccmp has_nul2, 0, 0, eq
> - beq L(nonascii_loop)
> + b.eq L(loop_entry)
>
> - /* Enter with C = has_nul1 == 0. */
> -L(tail):
> -#ifdef __AARCH64EB__
> - /* For big-endian, carry propagation (if the final byte in the
> - string is 0x01) means we cannot use has_nul1/2 directly. The
> - easiest way to get the correct byte is to byte-swap the data
> - and calculate the syndrome a second time. */
> - csel data1, data1, data2, cc
> - rev data1, data1
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, REP8_7f
> - bic has_nul1, tmp1, tmp2
> -#else
> + /* Find the exact offset of the first NUL byte at offset 16..31 from
> + the string start. Enter with C = has_nul1 == 0. */
> csel has_nul1, has_nul1, has_nul2, cc
> -#endif
> - sub len, src, srcin
> + mov len, 24
> rev has_nul1, has_nul1
> - add tmp2, len, 8
> + mov tmp3, 16
> clz tmp1, has_nul1
> - csel len, len, tmp2, cc
> + csel len, tmp3, len, cc
> add len, len, tmp1, lsr 3
> ret
>
> -L(nonascii_loop):
> - ldp data1, data2, [src, 16]!
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, REP8_7f
> - bics has_nul1, tmp1, tmp2
> - bic has_nul2, tmp3, tmp4
> - ccmp has_nul2, 0, 0, eq
> - bne L(tail)
> - ldp data1, data2, [src, 16]!
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, REP8_7f
> - bics has_nul1, tmp1, tmp2
> - bic has_nul2, tmp3, tmp4
> - ccmp has_nul2, 0, 0, eq
> - beq L(nonascii_loop)
> - b L(tail)
> + nop
> +L(loop_entry):
> + bic src, srcin, 31
> +
> + .p2align 5
> +L(loop):
> + ldp dataq1, dataq2, [src, 32]!
> + uminp maskv.16b, datav1.16b, datav2.16b
> + uminp maskv.16b, maskv.16b, maskv.16b
> + cmeq maskv.8b, maskv.8b, 0
> + fmov synd, maskd
> + cbz synd, L(loop)
> +
> + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
> + cmeq maskv.16b, datav1.16b, 0
> + sub len, src, srcin
> + cbnz syndw, 1f
> + cmeq maskv.16b, datav2.16b, 0
> + add len, len, 16
> +1:
> + /* Generate a bitmask and compute correct byte offset. */
> + shrn maskv.8b, maskv.8h, 4
> + fmov synd, maskd
> +#ifndef __AARCH64EB__
> + rbit synd, synd
> +#endif
> + clz tmp, synd
> + add len, len, tmp, lsr 2
> + ret
>
> - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> - srcin to 0x7f, so we ignore any NUL bytes before the string.
> - Then continue in the aligned loop. */
> L(page_cross):
> - bic src, srcin, 15
> - ldp data1, data2, [src]
> - lsl tmp1, srcin, 3
> - mov tmp4, -1
> -#ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> -#else
> - /* Little-endian. Early bytes are at LSB. */
> - lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> -#endif
> - orr tmp1, tmp1, REP8_80
> - orn data1, data1, tmp1
> - orn tmp2, data2, tmp1
> - tst srcin, 8
> - csel data1, data1, tmp4, eq
> - csel data2, data2, tmp2, eq
> - b L(page_cross_entry)
> -
> - .size strlen, . - strlen
> + bic src, srcin, 31
> + mov tmpw, 0x0c03
> + movk tmpw, 0xc030, lsl 16
> + ld1 {datav1.16b, datav2.16b}, [src]
> + dup maskv.4s, tmpw
> + cmeq datav1.16b, datav1.16b, 0
> + cmeq datav2.16b, datav2.16b, 0
> + and datav1.16b, datav1.16b, maskv.16b
> + and datav2.16b, datav2.16b, maskv.16b
> + addp maskv.16b, datav1.16b, datav2.16b
> + addp maskv.16b, maskv.16b, maskv.16b
> + fmov synd, maskd
> + lsl shift, srcin, 1
> + lsr synd, synd, shift
> + cbz synd, L(loop)
> +
> + rbit synd, synd
> + clz len, synd
> + lsr len, len, 1
> + ret
> +
> +END (strlen)
> #endif
> diff --git a/newlib/libc/machine/aarch64/strncmp.S b/newlib/libc/machine/aarch64/strncmp.S
> index ffdabc2607..373695503d 100644
> --- a/newlib/libc/machine/aarch64/strncmp.S
> +++ b/newlib/libc/machine/aarch64/strncmp.S
> @@ -1,49 +1,23 @@
> -/* Copyright (c) 2013, 2018, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> +/*
> + * strncmp - compare two strings
> + *
> + * Copyright (c) 2013-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strcmp-stub.c */
> #else
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64
> + * ARMv8-a, AArch64.
> + * MTE compatible.
> */
>
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> +#include "asmdefs.h"
>
> #define REP8_01 0x0101010101010101
> #define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
>
> /* Parameters and result. */
> #define src1 x0
> @@ -64,86 +38,91 @@
> #define tmp3 x10
> #define zeroones x11
> #define pos x12
> -#define limit_wd x13
> -#define mask x14
> -#define endloop x15
> +#define mask x13
> +#define endloop x14
> #define count mask
> +#define offset pos
> +#define neg_offset x15
> +
> +/* Define endian dependent shift operations.
> + On big-endian early bytes are at MSB and on little-endian LSB.
> + LS_FW means shifting towards early bytes.
> + LS_BK means shifting towards later bytes.
> + */
> +#ifdef __AARCH64EB__
> +#define LS_FW lsl
> +#define LS_BK lsr
> +#else
> +#define LS_FW lsr
> +#define LS_BK lsl
> +#endif
>
> - .text
> - .p2align 6
> - .rep 7
> - nop /* Pad so that the loop below fits a cache line. */
> - .endr
> -def_fn strncmp
> - cbz limit, .Lret0
> +ENTRY (strncmp)
> + PTR_ARG (0)
> + PTR_ARG (1)
> + SIZE_ARG (2)
> + cbz limit, L(ret0)
> eor tmp1, src1, src2
> mov zeroones, #REP8_01
> tst tmp1, #7
> and count, src1, #7
> - b.ne .Lmisaligned8
> - cbnz count, .Lmutual_align
> - /* Calculate the number of full and partial words -1. */
> - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
> - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
> + b.ne L(misaligned8)
> + cbnz count, L(mutual_align)
>
> /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> can be done in parallel across the entire word. */
> - /* Start of performance-critical section -- one 64B cache line. */
> -.Lloop_aligned:
> + .p2align 4
> +L(loop_aligned):
> ldr data1, [src1], #8
> ldr data2, [src2], #8
> -.Lstart_realigned:
> - subs limit_wd, limit_wd, #1
> +L(start_realigned):
> + subs limit, limit, #8
> sub tmp1, data1, zeroones
> orr tmp2, data1, #REP8_7f
> eor diff, data1, data2 /* Non-zero if differences found. */
> - csinv endloop, diff, xzr, pl /* Last Dword or differences. */
> + csinv endloop, diff, xzr, hi /* Last Dword or differences. */
> bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> ccmp endloop, #0, #0, eq
> - b.eq .Lloop_aligned
> - /* End of performance-critical section -- one 64B cache line. */
> + b.eq L(loop_aligned)
> + /* End of main loop */
>
> - /* Not reached the limit, must have found the end or a diff. */
> - tbz limit_wd, #63, .Lnot_limit
> -
> - /* Limit % 8 == 0 => all bytes significant. */
> - ands limit, limit, #7
> - b.eq .Lnot_limit
> -
> - lsl limit, limit, #3 /* Bits -> bytes. */
> - mov mask, #~0
> -#ifdef __AARCH64EB__
> - lsr mask, mask, limit
> -#else
> - lsl mask, mask, limit
> -#endif
> - bic data1, data1, mask
> - bic data2, data2, mask
> -
> - /* Make sure that the NUL byte is marked in the syndrome. */
> - orr has_nul, has_nul, mask
> -
> -.Lnot_limit:
> +L(full_check):
> +#ifndef __AARCH64EB__
> orr syndrome, diff, has_nul
> -
> -#ifndef __AARCH64EB__
> + add limit, limit, 8 /* Rewind limit to before last subs. */
> +L(syndrome_check):
> + /* Limit was reached. Check if the NUL byte or the difference
> + is before the limit. */
> rev syndrome, syndrome
> rev data1, data1
> - /* The MS-non-zero bit of the syndrome marks either the first bit
> - that is different, or the top bit of the first zero byte.
> - Shifting left now will bring the critical information into the
> - top bits. */
> clz pos, syndrome
> rev data2, data2
> lsl data1, data1, pos
> + cmp limit, pos, lsr #3
> lsl data2, data2, pos
> /* But we need to zero-extend (char is unsigned) the value and then
> perform a signed 32-bit subtraction. */
> lsr data1, data1, #56
> sub result, data1, data2, lsr #56
> + csel result, result, xzr, hi
> ret
> #else
> + /* Not reached the limit, must have found the end or a diff. */
> + tbz limit, #63, L(not_limit)
> + add tmp1, limit, 8
> + cbz limit, L(not_limit)
> +
> + lsl limit, tmp1, #3 /* Bytes -> bits. */
> + mov mask, #~0
> + lsr mask, mask, limit
> + bic data1, data1, mask
> + bic data2, data2, mask
> +
> + /* Make sure that the NUL byte is marked in the syndrome. */
> + orr has_nul, has_nul, mask
> +
> +L(not_limit):
> /* For big-endian we cannot use the trick with the syndrome value
> as carry-propagation can corrupt the upper bits if the trailing
> bytes in the string contain 0x01. */
> @@ -164,10 +143,11 @@ def_fn strncmp
> rev has_nul, has_nul
> orr syndrome, diff, has_nul
> clz pos, syndrome
> - /* The MS-non-zero bit of the syndrome marks either the first bit
> - that is different, or the top bit of the first zero byte.
> + /* The most-significant-non-zero bit of the syndrome marks either the
> + first bit that is different, or the top bit of the first zero byte.
> Shifting left now will bring the critical information into the
> top bits. */
> +L(end_quick):
> lsl data1, data1, pos
> lsl data2, data2, pos
> /* But we need to zero-extend (char is unsigned) the value and then
> @@ -177,7 +157,7 @@ def_fn strncmp
> ret
> #endif
>
> -.Lmutual_align:
> +L(mutual_align):
> /* Sources are mutually aligned, but are not currently at an
> alignment boundary. Round down the addresses and then mask off
> the bytes that precede the start point.
> @@ -189,102 +169,143 @@ def_fn strncmp
> neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
> ldr data2, [src2], #8
> mov tmp2, #~0
> - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
> -#ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
> -#else
> - /* Little-endian. Early bytes are at LSB. */
> - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
> -#endif
> - and tmp3, limit_wd, #7
> - lsr limit_wd, limit_wd, #3
> - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
> - add limit, limit, count
> - add tmp3, tmp3, count
> + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
> + /* Adjust the limit and ensure it doesn't overflow. */
> + adds limit, limit, count
> + csinv limit, limit, xzr, lo
> orr data1, data1, tmp2
> orr data2, data2, tmp2
> - add limit_wd, limit_wd, tmp3, lsr #3
> - b .Lstart_realigned
> + b L(start_realigned)
>
> - .p2align 6
> + .p2align 4
> /* Don't bother with dwords for up to 16 bytes. */
> -.Lmisaligned8:
> +L(misaligned8):
> cmp limit, #16
> - b.hs .Ltry_misaligned_words
> + b.hs L(try_misaligned_words)
>
> -.Lbyte_loop:
> +L(byte_loop):
> /* Perhaps we can do better than this. */
> ldrb data1w, [src1], #1
> ldrb data2w, [src2], #1
> subs limit, limit, #1
> ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
> ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
> - b.eq .Lbyte_loop
> -.Ldone:
> + b.eq L(byte_loop)
> +L(done):
> sub result, data1, data2
> ret
> /* Align the SRC1 to a dword by doing a bytewise compare and then do
> the dword loop. */
> -.Ltry_misaligned_words:
> - lsr limit_wd, limit, #3
> - cbz count, .Ldo_misaligned
> +L(try_misaligned_words):
> + cbz count, L(src1_aligned)
>
> neg count, count
> and count, count, #7
> sub limit, limit, count
> - lsr limit_wd, limit, #3
>
> -.Lpage_end_loop:
> +L(page_end_loop):
> ldrb data1w, [src1], #1
> ldrb data2w, [src2], #1
> cmp data1w, #1
> ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
> - b.ne .Ldone
> + b.ne L(done)
> subs count, count, #1
> - b.hi .Lpage_end_loop
> + b.hi L(page_end_loop)
> +
> + /* The following diagram explains the comparison of misaligned strings.
> + The bytes are shown in natural order. For little-endian, it is
> + reversed in the registers. The "x" bytes are before the string.
> + The "|" separates data that is loaded at one time.
> + src1 | a a a a a a a a | b b b c c c c c | . . .
> + src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
>
> -.Ldo_misaligned:
> - /* Prepare ourselves for the next page crossing. Unlike the aligned
> - loop, we fetch 1 less dword because we risk crossing bounds on
> - SRC2. */
> - mov count, #8
> - subs limit_wd, limit_wd, #1
> - b.lo .Ldone_loop
> -.Lloop_misaligned:
> - and tmp2, src2, #0xff8
> - eor tmp2, tmp2, #0xff8
> - cbz tmp2, .Lpage_end_loop
> + After shifting in each step, the data looks like this:
> + STEP_A STEP_B STEP_C
> + data1 a a a a a a a a b b b c c c c c b b b c c c c c
> + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
>
> + The bytes with "0" are eliminated from the syndrome via mask.
> +
> + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
> + time from SRC2. The comparison happens in 3 steps. After each step
> + the loop can exit, or read from SRC1 or SRC2. */
> +L(src1_aligned):
> + /* Calculate offset from 8 byte alignment to string start in bits. No
> + need to mask offset since shifts are ignoring upper bits. */
> + lsl offset, src2, #3
> + bic src2, src2, #0xf
> + mov mask, -1
> + neg neg_offset, offset
> ldr data1, [src1], #8
> - ldr data2, [src2], #8
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - eor diff, data1, data2 /* Non-zero if differences found. */
> - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> - ccmp diff, #0, #0, eq
> - b.ne .Lnot_limit
> - subs limit_wd, limit_wd, #1
> - b.pl .Lloop_misaligned
> + ldp tmp1, tmp2, [src2], #16
> + LS_BK mask, mask, neg_offset
> + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
> + /* Skip the first compare if data in tmp1 is irrelevant. */
> + tbnz offset, 6, L(misaligned_mid_loop)
>
> -.Ldone_loop:
> - /* We found a difference or a NULL before the limit was reached. */
> - and limit, limit, #7
> - cbz limit, .Lnot_limit
> - /* Read the last word. */
> - sub src1, src1, 8
> - sub src2, src2, 8
> - ldr data1, [src1, limit]
> - ldr data2, [src2, limit]
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> +L(loop_misaligned):
> + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
> + LS_FW data2, tmp1, offset
> + LS_BK tmp1, tmp2, neg_offset
> + subs limit, limit, #8
> + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
> + sub has_nul, data1, zeroones
> eor diff, data1, data2 /* Non-zero if differences found. */
> - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> - ccmp diff, #0, #0, eq
> - b.ne .Lnot_limit
> + orr tmp3, data1, #REP8_7f
> + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
> + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
> + orr tmp3, endloop, has_nul
> + cbnz tmp3, L(full_check)
> +
> + ldr data1, [src1], #8
> +L(misaligned_mid_loop):
> + /* STEP_B: Compare first part of data1 to second part of tmp2. */
> + LS_FW data2, tmp2, offset
> +#ifdef __AARCH64EB__
> + /* For big-endian we do a byte reverse to avoid carry-propagation
> + problem described above. This way we can reuse the has_nul in the
> + next step and also use syndrome value trick at the end. */
> + rev tmp3, data1
> + #define data1_fixed tmp3
> +#else
> + #define data1_fixed data1
> +#endif
> + sub has_nul, data1_fixed, zeroones
> + orr tmp3, data1_fixed, #REP8_7f
> + eor diff, data2, data1 /* Non-zero if differences found. */
> + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
> +#ifdef __AARCH64EB__
> + rev has_nul, has_nul
> +#endif
> + cmp limit, neg_offset, lsr #3
> + orr syndrome, diff, has_nul
> + bic syndrome, syndrome, mask /* Ignore later bytes. */
> + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
> + cbnz tmp3, L(syndrome_check)
> +
> + /* STEP_C: Compare second part of data1 to first part of tmp1. */
> + ldp tmp1, tmp2, [src2], #16
> + cmp limit, #8
> + LS_BK data2, tmp1, neg_offset
> + eor diff, data2, data1 /* Non-zero if differences found. */
> + orr syndrome, diff, has_nul
> + and syndrome, syndrome, mask /* Ignore earlier bytes. */
> + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
> + cbnz tmp3, L(syndrome_check)
> +
> + ldr data1, [src1], #8
> + sub limit, limit, #8
> + b L(loop_misaligned)
> +
> +#ifdef __AARCH64EB__
> +L(syndrome_check):
> + clz pos, syndrome
> + cmp pos, limit, lsl #3
> + b.lo L(end_quick)
> +#endif
>
> -.Lret0:
> +L(ret0):
> mov result, #0
> ret
> - .size strncmp, . - strncmp
> +END(strncmp)
> #endif
> diff --git a/newlib/libc/machine/aarch64/strnlen.S b/newlib/libc/machine/aarch64/strnlen.S
> index c255c3f7c6..091002e0b0 100644
> --- a/newlib/libc/machine/aarch64/strnlen.S
> +++ b/newlib/libc/machine/aarch64/strnlen.S
> @@ -1,187 +1,105 @@
> -/* strnlen - calculate the length of a string with limit.
> -
> - Copyright (c) 2013, Linaro Limited
> - All rights reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the Linaro nor the
> - names of its contributors may be used to endorse or promote products
> - derived from this software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> +/*
> + * strnlen - calculate the length of a string with limit.
> + *
> + * Copyright (c) 2020-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strlen-stub.c */
> #else
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64
> + * ARMv8-a, AArch64, Advanced SIMD.
> + * MTE compatible.
> */
>
> -/* Arguments and results. */
> +#include "asmdefs.h"
> +
> #define srcin x0
> -#define len x0
> -#define limit x1
> +#define cntin x1
> +#define result x0
>
> -/* Locals and temporaries. */
> #define src x2
> -#define data1 x3
> -#define data2 x4
> -#define data2a x5
> -#define has_nul1 x6
> -#define has_nul2 x7
> -#define tmp1 x8
> -#define tmp2 x9
> -#define tmp3 x10
> -#define tmp4 x11
> -#define zeroones x12
> -#define pos x13
> -#define limit_wd x14
> -
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -#define REP8_01 0x0101010101010101
> -#define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
> -
> - .text
> - .p2align 6
> -.Lstart:
> - /* Pre-pad to ensure critical loop begins an icache line. */
> - .rep 7
> - nop
> - .endr
> - /* Put this code here to avoid wasting more space with pre-padding. */
> -.Lhit_limit:
> - mov len, limit
> +#define synd x3
> +#define shift x4
> +#define tmp x4
> +#define cntrem x5
> +
> +#define qdata q0
> +#define vdata v0
> +#define vhas_chr v1
> +#define vend v2
> +#define dend d2
> +
> +/*
> + Core algorithm:
> + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
> + four bits per byte using the shrn instruction. A count trailing zeros then
> + identifies the first zero byte. */
> +
> +ENTRY (strnlen)
> + PTR_ARG (0)
> + SIZE_ARG (1)
> + bic src, srcin, 15
> + cbz cntin, L(nomatch)
> + ld1 {vdata.16b}, [src]
> + cmeq vhas_chr.16b, vdata.16b, 0
> + lsl shift, srcin, 2
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> + fmov synd, dend
> + lsr synd, synd, shift
> + cbz synd, L(start_loop)
> +L(finish):
> + rbit synd, synd
> + clz synd, synd
> + lsr result, synd, 2
> + cmp cntin, result
> + csel result, cntin, result, ls
> ret
>
> -def_fn strnlen
> - cbz limit, .Lhit_limit
> - mov zeroones, #REP8_01
> - bic src, srcin, #15
> - ands tmp1, srcin, #15
> - b.ne .Lmisaligned
> - /* Calculate the number of full and partial words -1. */
> - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
> - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
> -
> - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> - can be done in parallel across the entire word. */
> - /* The inner loop deals with two Dwords at a time. This has a
> - slightly higher start-up cost, but we should win quite quickly,
> - especially on cores with a high number of issue slots per
> - cycle, as we get much better parallelism out of the operations. */
> -
> - /* Start of critial section -- keep to one 64Byte cache line. */
> -.Lloop:
> - ldp data1, data2, [src], #16
> -.Lrealigned:
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> - bic has_nul1, tmp1, tmp2
> - bic has_nul2, tmp3, tmp4
> - subs limit_wd, limit_wd, #1
> - orr tmp1, has_nul1, has_nul2
> - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
> - b.eq .Lloop
> - /* End of critical section -- keep to one 64Byte cache line. */
> -
> - orr tmp1, has_nul1, has_nul2
> - cbz tmp1, .Lhit_limit /* No null in final Qword. */
> -
> - /* We know there's a null in the final Qword. The easiest thing
> - to do now is work out the length of the string and return
> - MIN (len, limit). */
> -
> - sub len, src, srcin
> - cbz has_nul1, .Lnul_in_data2
> -#ifdef __AARCH64EB__
> - mov data2, data1
> -#endif
> - sub len, len, #8
> - mov has_nul2, has_nul1
> -.Lnul_in_data2:
> -#ifdef __AARCH64EB__
> - /* For big-endian, carry propagation (if the final byte in the
> - string is 0x01) means we cannot use has_nul directly. The
> - easiest way to get the correct byte is to byte-swap the data
> - and calculate the syndrome a second time. */
> - rev data2, data2
> - sub tmp1, data2, zeroones
> - orr tmp2, data2, #REP8_7f
> - bic has_nul2, tmp1, tmp2
> -#endif
> - sub len, len, #8
> - rev has_nul2, has_nul2
> - clz pos, has_nul2
> - add len, len, pos, lsr #3 /* Bits to bytes. */
> - cmp len, limit
> - csel len, len, limit, ls /* Return the lower value. */
> +L(nomatch):
> + mov result, cntin
> ret
>
> -.Lmisaligned:
> - /* Deal with a partial first word.
> - We're doing two things in parallel here;
> - 1) Calculate the number of words (but avoiding overflow if
> - limit is near ULONG_MAX) - to do this we need to work out
> - limit + tmp1 - 1 as a 65-bit value before shifting it;
> - 2) Load and mask the initial data words - we force the bytes
> - before the ones we are interested in to 0xff - this ensures
> - early bytes will not hit any zero detection. */
> - sub limit_wd, limit, #1
> - neg tmp4, tmp1
> - cmp tmp1, #8
> -
> - and tmp3, limit_wd, #15
> - lsr limit_wd, limit_wd, #4
> - mov tmp2, #~0
> -
> - ldp data1, data2, [src], #16
> - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
> - add tmp3, tmp3, tmp1
> -
> -#ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
> -#else
> - /* Little-endian. Early bytes are at LSB. */
> - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
> +L(start_loop):
> + sub tmp, src, srcin
> + add tmp, tmp, 17
> + subs cntrem, cntin, tmp
> + b.lo L(nomatch)
> +
> + /* Make sure that it won't overread by a 16-byte chunk */
> + tbz cntrem, 4, L(loop32_2)
> + sub src, src, 16
> + .p2align 5
> +L(loop32):
> + ldr qdata, [src, 32]!
> + cmeq vhas_chr.16b, vdata.16b, 0
> + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + fmov synd, dend
> + cbnz synd, L(end)
> +L(loop32_2):
> + ldr qdata, [src, 16]
> + subs cntrem, cntrem, 32
> + cmeq vhas_chr.16b, vdata.16b, 0
> + b.lo L(end_2)
> + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + fmov synd, dend
> + cbz synd, L(loop32)
> +L(end_2):
> + add src, src, 16
> +L(end):
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> + sub result, src, srcin
> + fmov synd, dend
> +#ifndef __AARCH64EB__
> + rbit synd, synd
> #endif
> - add limit_wd, limit_wd, tmp3, lsr #4
> -
> - orr data1, data1, tmp2
> - orr data2a, data2, tmp2
> -
> - csinv data1, data1, xzr, le
> - csel data2, data2, data2a, le
> - b .Lrealigned
> - .size strnlen, . - .Lstart /* Include pre-padding in size. */
> + clz synd, synd
> + add result, result, synd, lsr 2
> + cmp cntin, result
> + csel result, cntin, result, ls
> + ret
>
> +END (strnlen)
> #endif
> diff --git a/newlib/libc/machine/aarch64/strrchr.S b/newlib/libc/machine/aarch64/strrchr.S
> index d64fc09b1a..b0574228b6 100644
> --- a/newlib/libc/machine/aarch64/strrchr.S
> +++ b/newlib/libc/machine/aarch64/strrchr.S
> @@ -1,32 +1,9 @@
> /*
> - strrchr - find last instance of a character in a string
> -
> - Copyright (c) 2014, ARM Limited
> - All rights Reserved.
> -
> - Redistribution and use in source and binary forms, with or without
> - modification, are permitted provided that the following conditions are met:
> - * Redistributions of source code must retain the above copyright
> - notice, this list of conditions and the following disclaimer.
> - * Redistributions in binary form must reproduce the above copyright
> - notice, this list of conditions and the following disclaimer in the
> - documentation and/or other materials provided with the distribution.
> - * Neither the name of the company nor the names of its contributors
> - may be used to endorse or promote products derived from this
> - software without specific prior written permission.
> -
> - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
> -
> + * strrchr - find last position of a character in a string.
> + *
> + * Copyright (c) 2014-2022, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
> /* See strchr-stub.c */
> #else
> @@ -37,6 +14,8 @@
> * Neon Available.
> */
>
> +#include "asmdefs.h"
> +
> /* Arguments and results. */
> #define srcin x0
> #define chrin w1
> @@ -78,17 +57,8 @@
> in the original string a count_trailing_zeros() operation will
> identify exactly which byte is causing the termination, and why. */
>
> -/* Locals and temporaries. */
> -
> - .macro def_fn f p2align=0
> - .text
> - .p2align \p2align
> - .global \f
> - .type \f, %function
> -\f:
> - .endm
> -
> -def_fn strrchr
> +ENTRY (strrchr)
> + PTR_ARG (0)
> /* Magic constant 0x40100401 to allow us to identify which lane
> matches the requested byte. Magic constant 0x80200802 used
> similarly for NUL termination. */
> @@ -100,7 +70,7 @@ def_fn strrchr
> mov src_offset, #0
> ands tmp1, srcin, #31
> add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
> - b.eq .Laligned
> + b.eq L(aligned)
>
> /* Input string is not 32-byte aligned. Rather than forcing
> the padding bytes to a safe value, we calculate the syndrome
> @@ -118,45 +88,45 @@ def_fn strrchr
> and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
> addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
> addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
> - addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
> - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
> - mov nul_match, vhas_nul1.2d[0]
> + addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
> + mov nul_match, vend1.d[0]
> lsl tmp1, tmp1, #1
> mov const_m1, #~0
> - mov chr_match, vhas_chr1.2d[0]
> lsr tmp3, const_m1, tmp1
> + mov chr_match, vend1.d[1]
>
> bic nul_match, nul_match, tmp3 // Mask padding bits.
> bic chr_match, chr_match, tmp3 // Mask padding bits.
> - cbnz nul_match, .Ltail
> + cbnz nul_match, L(tail)
>
> -.Lloop:
> + .p2align 4
> +L(loop):
> cmp chr_match, #0
> csel src_match, src, src_match, ne
> csel src_offset, chr_match, src_offset, ne
> -.Laligned:
> +L(aligned):
> ld1 {vdata1.16b, vdata2.16b}, [src], #32
> - cmeq vhas_nul1.16b, vdata1.16b, #0
> cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
> - cmeq vhas_nul2.16b, vdata2.16b, #0
> cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
> - addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
> + uminp vend1.16b, vdata1.16b, vdata2.16b
> and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
> and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
> + cmeq vend1.16b, vend1.16b, 0
> addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
> - addp vend1.16b, vend1.16b, vend1.16b // 128->64
> - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
> - mov nul_match, vend1.2d[0]
> - mov chr_match, vhas_chr1.2d[0]
> - cbz nul_match, .Lloop
> + addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
> + mov nul_match, vend1.d[0]
> + mov chr_match, vend1.d[1]
> + cbz nul_match, L(loop)
>
> + cmeq vhas_nul1.16b, vdata1.16b, #0
> + cmeq vhas_nul2.16b, vdata2.16b, #0
> and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
> and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
> addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
> addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
> - mov nul_match, vhas_nul1.2d[0]
> + mov nul_match, vhas_nul1.d[0]
>
> -.Ltail:
> +L(tail):
> /* Work out exactly where the string ends. */
> sub tmp4, nul_match, #1
> eor tmp4, tmp4, nul_match
> @@ -178,5 +148,5 @@ def_fn strrchr
>
> ret
>
> - .size strrchr, . - strrchr
> +END (strrchr)
> #endif
next prev parent reply other threads:[~2023-10-05 10:37 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-09-12 10:05 [PATCH 0/2] " Sebastian Huber
2023-09-12 10:05 ` [PATCH v3 1/2] " Sebastian Huber
2023-10-05 10:37 ` Richard Earnshaw [this message]
2023-10-05 12:23 ` Sebastian Huber
2023-09-12 10:05 ` [PATCH v3 2/2] aarch64: Import memrchr.S Sebastian Huber
2023-09-18 12:25 ` [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines Sebastian Huber
2023-09-27 9:53 ` Sebastian Huber
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=eca2ad5d-d833-235c-cad2-a1cd76dab88d@foss.arm.com \
--to=richard.earnshaw@foss.arm.com \
--cc=newlib@sourceware.org \
--cc=sebastian.huber@embedded-brains.de \
--cc=szabolcs.nagy@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).