* [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines
@ 2023-09-12 10:05 Sebastian Huber
  2023-09-12 10:05 ` [PATCH v3 1/2] " Sebastian Huber
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Sebastian Huber @ 2023-09-12 10:05 UTC (permalink / raw)
To: newlib; +Cc: Szabolcs Nagy

This patch set synchronizes AArch64-specific files with the
https://github.com/ARM-software/optimized-routines upstream.

Sebastian Huber (2):
  aarch64: Sync with ARM-software/optimized-routines
  aarch64: Import memrchr.S

v3: Use latest commit.

 newlib/Makefile.in                         |  40 ++
 newlib/libc/machine/aarch64/Makefile.inc   |   2 +
 newlib/libc/machine/aarch64/asmdefs.h      | 106 +++++
 newlib/libc/machine/aarch64/memchr.S       |  73 ++--
 newlib/libc/machine/aarch64/memcmp.S       | 311 ++++++++-------
 newlib/libc/machine/aarch64/memcpy.S       | 272 +++++++------
 newlib/libc/machine/aarch64/memrchr-stub.c |  11 +
 newlib/libc/machine/aarch64/memrchr.S      | 115 ++++++
 newlib/libc/machine/aarch64/memset.S       | 194 ++-------
 newlib/libc/machine/aarch64/stpcpy.S       |  36 +-
 newlib/libc/machine/aarch64/strchr.S       | 107 ++---
 newlib/libc/machine/aarch64/strchrnul.S    |  90 ++---
 newlib/libc/machine/aarch64/strcmp.S       | 282 +++++++------
 newlib/libc/machine/aarch64/strcpy.S       | 437 ++++++---------
 newlib/libc/machine/aarch64/strlen.S       | 319 +++++++--------
 newlib/libc/machine/aarch64/strncmp.S      | 323 ++++++++-------
 newlib/libc/machine/aarch64/strnlen.S      | 256 ++++--------
 newlib/libc/machine/aarch64/strrchr.S      |  86 ++--
 18 files changed, 1394 insertions(+), 1666 deletions(-)
 create mode 100644 newlib/libc/machine/aarch64/asmdefs.h
 create mode 100644 newlib/libc/machine/aarch64/memrchr-stub.c
 create mode 100644 newlib/libc/machine/aarch64/memrchr.S

-- 
2.35.3
* [PATCH v3 1/2] aarch64: Sync with ARM-software/optimized-routines
  2023-09-12 10:05 [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines Sebastian Huber
@ 2023-09-12 10:05 ` Sebastian Huber
  2023-10-05 10:37   ` Richard Earnshaw
  2023-09-12 10:05 ` [PATCH v3 2/2] aarch64: Import memrchr.S Sebastian Huber
  2023-09-18 12:25 ` [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines Sebastian Huber
  2 siblings, 1 reply; 7+ messages in thread
From: Sebastian Huber @ 2023-09-12 10:05 UTC (permalink / raw)
To: newlib; +Cc: Szabolcs Nagy

Update AArch64 assembly string routines from:

https://github.com/ARM-software/optimized-routines

commit 0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560
Author: Sebastian Huber <sebastian.huber@embedded-brains.de>
Date:   Thu Jul 27 17:14:57 2023 +0200

    string: Fix corrupt GNU_PROPERTY_TYPE (5) size

    For ELF32 the notes alignment is 4 and not 8.
---
 newlib/libc/machine/aarch64/asmdefs.h   | 106 ++++
 newlib/libc/machine/aarch64/memchr.S    |  73 ++--
 newlib/libc/machine/aarch64/memcmp.S    | 311 +++++++++--------
 newlib/libc/machine/aarch64/memcpy.S    | 272 ++++++++-------
 newlib/libc/machine/aarch64/memset.S    | 194 ++---------
 newlib/libc/machine/aarch64/stpcpy.S    |  36 +-
 newlib/libc/machine/aarch64/strchr.S    | 107 ++----
 newlib/libc/machine/aarch64/strchrnul.S |  90 ++---
 newlib/libc/machine/aarch64/strcmp.S    | 282 ++++++++-------
 newlib/libc/machine/aarch64/strcpy.S    | 437 +++++++-----------
 newlib/libc/machine/aarch64/strlen.S    | 319 ++++++++---------
 newlib/libc/machine/aarch64/strncmp.S   | 323 ++++++++++--------
 newlib/libc/machine/aarch64/strnlen.S   | 256 +++++---------
 newlib/libc/machine/aarch64/strrchr.S   |  86 ++---
 14 files changed, 1226 insertions(+), 1666 deletions(-)
 create mode 100644 newlib/libc/machine/aarch64/asmdefs.h

diff --git a/newlib/libc/machine/aarch64/asmdefs.h b/newlib/libc/machine/aarch64/asmdefs.h
new file mode 100644
index 0000000000..131b95e1fe
--- /dev/null
+++ b/newlib/libc/machine/aarch64/asmdefs.h
@@ -0,0 +1,106 @@
+/*
+ * Macros for asm code. AArch64 version.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+/* Branch Target Identification support.  */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret).  */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h.  */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note.  */
+#ifdef __ILP32__
+#define GNU_PROPERTY(type, value)	\
+  .section .note.gnu.property, "a";	\
+  .p2align 2;				\
+  .word 4;				\
+  .word 12;				\
+  .word 5;				\
+  .asciz "GNU";				\
+  .word type;				\
+  .word 4;				\
+  .word value;				\
+  .text
+#else
+#define GNU_PROPERTY(type, value)	\
+  .section .note.gnu.property, "a";	\
+  .p2align 3;				\
+  .word 4;				\
+  .word 16;				\
+  .word 5;				\
+  .asciz "GNU";				\
+  .word type;				\
+  .word 4;				\
+  .word value;				\
+  .word 0;				\
+  .text
+#endif
+
+/* If set then the GNU Property Note section will be added to
+   mark objects to support BTI and PAC-RET.  */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files.
 */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment)	\
+  .global name;		\
+  .type name,%function;	\
+  .align alignment;	\
+  name:			\
+  .cfi_startproc;	\
+  BTI_C;
+
+#define ENTRY(name) ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name)	\
+  .global name;		\
+  .type name,%function;	\
+  name:
+
+#define END(name)	\
+  .cfi_endproc;		\
+  .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+  /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n)  mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+  /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n)  mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+/* Compiler supports SVE instructions */
+#ifndef HAVE_SVE
+# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
+#  define HAVE_SVE 1
+# else
+#  define HAVE_SVE 0
+# endif
+#endif
+
+#endif
diff --git a/newlib/libc/machine/aarch64/memchr.S b/newlib/libc/machine/aarch64/memchr.S
index 53f5d6bc0e..a0f305e0fc 100644
--- a/newlib/libc/machine/aarch64/memchr.S
+++ b/newlib/libc/machine/aarch64/memchr.S
@@ -1,31 +1,8 @@
 /*
  * memchr - find a character in a memory zone
  *
- * Copyright (c) 2014, ARM Limited
- * All rights Reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the company nor the names of its contributors
- *       may be used to endorse or promote products derived from this
- *       software without specific prior written permission.
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) @@ -37,6 +14,8 @@ * Neon Available. */ +#include "asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -70,17 +49,11 @@ * identify exactly which byte has matched. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn memchr +ENTRY (memchr) + PTR_ARG (0) + SIZE_ARG (2) /* Do not dereference srcin if no bytes to compare. */ - cbz cntin, .Lzero_length + cbz cntin, L(zero_length) /* * Magic constant 0x40100401 allows us to identify which lane matches * the requested byte. @@ -93,7 +66,7 @@ def_fn memchr dup vrepmask.4s, wtmp2 ands soff, srcin, #31 and cntrem, cntin, #31 - b.eq .Lloop + b.eq L(loop) /* * Input string is not 32-byte aligned. 
We calculate the syndrome @@ -110,41 +83,41 @@ def_fn memchr and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ addp vend.16b, vend.16b, vend.16b /* 128->64 */ - mov synd, vend.2d[0] + mov synd, vend.d[0] /* Clear the soff*2 lower bits */ lsl tmp, soff, #1 lsr synd, synd, tmp lsl synd, synd, tmp /* The first block can also be the last */ - b.ls .Lmasklast + b.ls L(masklast) /* Have we found something already? */ - cbnz synd, .Ltail + cbnz synd, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 subs cntin, cntin, #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b /* If we're out of data we finish regardless of the result */ - b.ls .Lend + b.ls L(end) /* Use a fast check for the termination condition */ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b addp vend.2d, vend.2d, vend.2d - mov synd, vend.2d[0] + mov synd, vend.d[0] /* We're not out of data, loop if we haven't found the character */ - cbz synd, .Lloop + cbz synd, L(loop) -.Lend: +L(end): /* Termination condition found, let's calculate the syndrome value */ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ addp vend.16b, vend.16b, vend.16b /* 128->64 */ - mov synd, vend.2d[0] + mov synd, vend.d[0] /* Only do the clear for the last possible block */ - b.hi .Ltail + b.hs L(tail) -.Lmasklast: +L(masklast): /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ add tmp, cntrem, soff and tmp, tmp, #31 @@ -153,7 +126,7 @@ def_fn memchr lsl synd, synd, tmp lsr synd, synd, tmp -.Ltail: +L(tail): /* Count the trailing zeros using bit reversing */ rbit synd, synd /* Compensate the last post-increment */ @@ -168,9 +141,9 @@ def_fn memchr csel result, xzr, result, eq ret -.Lzero_length: +L(zero_length): mov result, #0 ret - .size memchr, . 
- memchr +END (memchr) #endif diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S index 605d99365e..18874d3215 100644 --- a/newlib/libc/machine/aarch64/memcmp.S +++ b/newlib/libc/machine/aarch64/memcmp.S @@ -1,57 +1,7 @@ /* memcmp - compare memory - - Copyright (c) 2018 Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* - * Copyright (c) 2017 ARM Ltd - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) @@ -60,103 +10,79 @@ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. */ -#define L(l) .L ## l - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 - -/* Internal variables. 
*/ -#define data1 x3 -#define data1w w3 -#define data1h x4 -#define data2 x5 -#define data2w w5 -#define data2h x6 -#define tmp1 x7 -#define tmp2 x8 - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn memcmp p2align=6 - subs limit, limit, 8 - b.lo L(less8) - - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - b.ne L(return) - - subs limit, limit, 8 - b.gt L(more16) - - ldr data1, [src1, limit] - ldr data2, [src2, limit] - b L(return) - -L(more16): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - bne L(return) - - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) - strings. */ - subs limit, limit, 16 +#include "asmdefs.h" + +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 + +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define data3 x5 +#define data3w w5 +#define data4 x6 +#define data4w w6 +#define tmp x6 +#define src1end x7 +#define src2end x8 + + +ENTRY (memcmp) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + cmp limit, 16 + b.lo L(less16) + ldp data1, data3, [src1] + ldp data2, data4, [src2] + ccmp data1, data2, 0, ne + ccmp data3, data4, 0, eq + b.ne L(return2) + + add src1end, src1, limit + add src2end, src2, limit + cmp limit, 32 b.ls L(last_bytes) + cmp limit, 160 + b.hs L(loop_align) + sub limit, limit, 32 - /* We overlap loads between 0-32 bytes at either side of SRC1 when we - try to align, so limit it only to strings larger than 128 bytes. */ - cmp limit, 96 - b.ls L(loop16) - - /* Align src1 and adjust src2 with bytes not yet done. */ - and tmp1, src1, 15 - add limit, limit, tmp1 - sub src1, src1, tmp1 - sub src2, src2, tmp1 - - /* Loop performing 16 bytes per iteration using aligned src1. - Limit is pre-decremented by 16 and must be larger than zero. - Exit if <= 16 bytes left to do or if the data is not equal. 
*/ .p2align 4 -L(loop16): - ldp data1, data1h, [src1], 16 - ldp data2, data2h, [src2], 16 - subs limit, limit, 16 - ccmp data1, data2, 0, hi - ccmp data1h, data2h, 0, eq - b.eq L(loop16) - +L(loop32): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ccmp data3, data4, 0, eq + b.ne L(return2) + cmp limit, 16 + b.ls L(last_bytes) + + ldp data1, data3, [src1, 32] + ldp data2, data4, [src2, 32] cmp data1, data2 - bne L(return) + ccmp data3, data4, 0, eq + b.ne L(return2) + add src1, src1, 32 + add src2, src2, 32 +L(last64): + subs limit, limit, 32 + b.hi L(loop32) /* Compare last 1-16 bytes using unaligned access. */ L(last_bytes): - add src1, src1, limit - add src2, src2, limit - ldp data1, data1h, [src1] - ldp data2, data2h, [src2] - cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ldp data1, data3, [src1end, -16] + ldp data2, data4, [src2end, -16] +L(return2): cmp data1, data2 + csel data1, data1, data3, ne + csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): @@ -164,33 +90,106 @@ L(return): rev data1, data1 rev data2, data2 #endif - cmp data1, data2 -L(ret_eq): + cmp data1, data2 cset result, ne cneg result, result, lo ret .p2align 4 - /* Compare up to 8 bytes. Limit is [-8..-1]. 
*/ +L(less16): + add src1end, src1, limit + add src2end, src2, limit + tbz limit, 3, L(less8) + ldr data1, [src1] + ldr data2, [src2] + ldr data3, [src1end, -8] + ldr data4, [src2end, -8] + b L(return2) + + .p2align 4 L(less8): - adds limit, limit, 4 - b.lo L(less4) - ldr data1w, [src1], 4 - ldr data2w, [src2], 4 + tbz limit, 2, L(less4) + ldr data1w, [src1] + ldr data2w, [src2] + ldr data3w, [src1end, -4] + ldr data4w, [src2end, -4] + b L(return2) + +L(less4): + tbz limit, 1, L(less2) + ldrh data1w, [src1] + ldrh data2w, [src2] cmp data1w, data2w b.ne L(return) - sub limit, limit, 4 -L(less4): - adds limit, limit, 4 - beq L(ret_eq) -L(byte_loop): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - subs limit, limit, 1 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.eq L(byte_loop) +L(less2): + mov result, 0 + tbz limit, 0, L(return_zero) + ldrb data1w, [src1end, -1] + ldrb data2w, [src2end, -1] sub result, data1w, data2w +L(return_zero): + ret + +L(loop_align): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] + cmp data1, data2 + ccmp data3, data4, 0, eq + b.ne L(return2) + + /* Align src2 and adjust src1, src2 and limit. */ + and tmp, src2, 15 + sub tmp, tmp, 16 + sub src2, src2, tmp + add limit, limit, tmp + sub src1, src1, tmp + sub limit, limit, 64 + 16 + + .p2align 4 +L(loop64): + ldr q0, [src1, 16] + ldr q1, [src2, 16] + subs limit, limit, 64 + ldr q2, [src1, 32] + ldr q3, [src2, 32] + eor v0.16b, v0.16b, v1.16b + eor v1.16b, v2.16b, v3.16b + ldr q2, [src1, 48] + ldr q3, [src2, 48] + umaxp v0.16b, v0.16b, v1.16b + ldr q4, [src1, 64]! + ldr q5, [src2, 64]! + eor v1.16b, v2.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + umaxp v1.16b, v1.16b, v2.16b + umaxp v0.16b, v0.16b, v1.16b + umaxp v0.16b, v0.16b, v0.16b + fmov tmp, d0 + ccmp tmp, 0, 0, hi + b.eq L(loop64) + + /* If equal, process last 1-64 bytes using scalar loop. */ + add limit, limit, 64 + 16 + cbz tmp, L(last64) + + /* Determine the 8-byte aligned offset of the first difference. 
*/ +#ifdef __AARCH64EB__ + rev16 tmp, tmp +#endif + rev tmp, tmp + clz tmp, tmp + bic tmp, tmp, 7 + sub tmp, tmp, 48 + ldr data1, [src1, tmp] + ldr data2, [src2, tmp] +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + mov result, 1 + cmp data1, data2 + cneg result, result, lo ret - .size memcmp, . - memcmp +END (memcmp) #endif diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S index 463bad0a18..248e7843a2 100644 --- a/newlib/libc/machine/aarch64/memcpy.S +++ b/newlib/libc/machine/aarch64/memcpy.S @@ -1,55 +1,8 @@ -/* Copyright (c) 2012-2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - /* - * Copyright (c) 2015 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. + * memcpy - copy memory area * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -61,6 +14,7 @@ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See memcpy-stub.c */ #else +#include "asmdefs.h" #define dstin x0 #define src x1 @@ -71,122 +25,139 @@ #define A_l x6 #define A_lw w6 #define A_h x7 -#define A_hw w7 #define B_l x8 #define B_lw w8 #define B_h x9 #define C_l x10 +#define C_lw w10 #define C_h x11 #define D_l x12 #define D_h x13 -#define E_l src -#define E_h count -#define F_l srcend -#define F_h dst -#define tmp1 x9 - -#define L(l) .L ## l - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -/* Copies are split into 3 main cases: small copies of up to 16 bytes, - medium copies of 17..96 bytes which are fully unrolled. Large copies - of more than 96 bytes align the destination and use an unrolled loop - processing 64 bytes per iteration. - Small and medium copies read all data before writing, allowing any - kind of overlap, and memmove tailcalls memcpy for these cases as - well as non-overlapping copies. +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend +#define tmp1 x14 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. 
It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. */ -def_fn memcpy p2align=6 - prfm PLDL1KEEP, [src] +ENTRY_ALIAS (memmove) +ENTRY (memcpy) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) add srcend, src, count add dstend, dstin, count - cmp count, 16 - b.ls L(copy16) - cmp count, 96 + cmp count, 128 b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) - /* Medium copies: 17..96 bytes. */ - sub tmp1, count, 1 + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) ldp A_l, A_h, [src] - tbnz tmp1, 6, L(copy96) ldp D_l, D_h, [srcend, -16] - tbz tmp1, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] -1: stp A_l, A_h, [dstin] stp D_l, D_h, [dstend, -16] ret - .p2align 4 - /* Small copies: 0..16 bytes. */ + /* Copy 8-15 bytes. */ L(copy16): - cmp count, 8 - b.lo 1f + tbz count, 3, L(copy8) ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret - .p2align 4 -1: - tbz count, 2, 1f + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) ldr A_lw, [src] - ldr A_hw, [srcend, -4] + ldr B_lw, [srcend, -4] str A_lw, [dstin] - str A_hw, [dstend, -4] + str B_lw, [dstend, -4] ret - /* Copy 0..3 bytes. Use a branchless sequence that copies the same - byte 3 times if count==1, or the 2nd byte twice if count==2. */ -1: - cbz count, 2f + /* Copy 0..3 bytes using a branchless sequence. 
*/ +L(copy4): + cbz count, L(copy0) lsr tmp1, count, 1 ldrb A_lw, [src] - ldrb A_hw, [srcend, -1] + ldrb C_lw, [srcend, -1] ldrb B_lw, [src, tmp1] strb A_lw, [dstin] strb B_lw, [dstin, tmp1] - strb A_hw, [dstend, -1] -2: ret + strb C_lw, [dstend, -1] +L(copy0): + ret .p2align 4 - /* Copy 64..96 bytes. Copy 64 bytes from the start and - 32 bytes from the end. */ -L(copy96): + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [src, 32] - ldp D_l, D_h, [src, 48] - ldp E_l, E_h, [srcend, -32] - ldp F_l, F_h, [srcend, -16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) stp A_l, A_h, [dstin] stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin, 32] - stp D_l, D_h, [dstin, 48] - stp E_l, E_h, [dstend, -32] - stp F_l, F_h, [dstend, -16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] ret - /* Align DST to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret .p2align 4 + /* Copy more than 128 bytes. */ L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. 
*/ + + ldp D_l, D_h, [src] and tmp1, dstin, 15 bic dst, dstin, 15 - ldp D_l, D_h, [src] sub src, src, tmp1 add count, count, tmp1 /* Count is now 16 too large. */ ldp A_l, A_h, [src, 16] @@ -195,8 +166,9 @@ L(copy_long): ldp C_l, C_h, [src, 48] ldp D_l, D_h, [src, 64]! subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls 2f -1: + b.ls L(copy64_from_end) + +L(loop64): stp A_l, A_h, [dst, 16] ldp A_l, A_h, [src, 16] stp B_l, B_h, [dst, 32] @@ -206,12 +178,10 @@ L(copy_long): stp D_l, D_h, [dst, 64]! ldp D_l, D_h, [src, 64]! subs count, count, 64 - b.hi 1b + b.hi L(loop64) - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the end even if - there is just 1 byte left. */ -2: + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): ldp E_l, E_h, [srcend, -64] stp A_l, A_h, [dst, 16] ldp A_l, A_h, [srcend, -48] @@ -226,5 +196,51 @@ L(copy_long): stp C_l, C_h, [dstend, -16] ret - .size memcpy, . - memcpy + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. 
*/ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret + +END (memcpy) #endif diff --git a/newlib/libc/machine/aarch64/memset.S b/newlib/libc/machine/aarch64/memset.S index 103e3f8bb0..ca76439a91 100644 --- a/newlib/libc/machine/aarch64/memset.S +++ b/newlib/libc/machine/aarch64/memset.S @@ -1,66 +1,20 @@ -/* Copyright (c) 2012-2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - /* - * Copyright (c) 2015 ARM Ltd - * All rights reserved. + * memset - fill memory with a constant byte * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See memset-stub.c */ #else +#include "asmdefs.h" #define dstin x0 #define val x1 @@ -68,24 +22,11 @@ #define count x2 #define dst x3 #define dstend x4 -#define tmp1 x5 -#define tmp1w w5 -#define tmp2 x6 -#define tmp2w w6 -#define zva_len x7 -#define zva_lenw w7 - -#define L(l) .L ## l +#define zva_val x5 - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn memset p2align=6 +ENTRY (memset) + PTR_ARG (0) + SIZE_ARG (2) dup v0.16B, valw add dstend, dstin, count @@ -101,7 +42,7 @@ def_fn memset p2align=6 str val, [dstin] str val, [dstend, -8] ret - nop + .p2align 4 1: tbz count, 2, 2f str valw, [dstin] str valw, [dstend, -4] @@ -131,110 +72,49 @@ L(set96): stp q0, q0, [dstend, -32] ret - .p2align 3 - nop + .p2align 4 L(set_long): and valw, valw, 255 bic dst, dstin, 15 str q0, [dstin] - cmp count, 256 - ccmp valw, 0, 0, cs - b.eq L(try_zva) -L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - sub dst, dst, 16 /* Dst is biased by -32. */ - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -1: stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64]! 
-L(tail64): - subs count, count, 64 - b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - - .p2align 3 -L(try_zva): - mrs tmp1, dczid_el0 - tbnz tmp1w, 4, L(no_zva) - and tmp1w, tmp1w, 15 - cmp tmp1w, 4 /* ZVA size is 64 bytes. */ - b.ne L(zva_128) - - /* Write the first and last 64 byte aligned block using stp rather - than using DC ZVA. This is faster on some cores. - */ -L(zva_64): + cmp count, 160 + ccmp valw, 0, 0, hs + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif str q0, [dst, 16] stp q0, q0, [dst, 32] bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ - add dst, dst, 128 - nop -1: dc zva, dst + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): add dst, dst, 64 + dc zva, dst subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] + b.hi L(zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret - .p2align 3 -L(zva_128): - cmp tmp1w, 5 /* ZVA size is 128 bytes. */ - b.ne L(zva_other) - - str q0, [dst, 16] +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + sub dst, dst, 16 /* Dst is biased by -32. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - bic dst, dst, 127 - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+128 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 128 - subs count, count, 128 - b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] + stp q0, q0, [dst, 64]! 
+ subs count, count, 64 + b.hi L(no_zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret -L(zva_other): - mov tmp2w, 4 - lsl zva_lenw, tmp2w, tmp1w - add tmp1, zva_len, 64 /* Max alignment bytes written. */ - cmp count, tmp1 - blo L(no_zva) - - sub tmp2, zva_len, 1 - add tmp1, dst, zva_len - add dst, dst, 16 - subs count, tmp1, dst /* Actual alignment bytes to write. */ - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ - beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] - subs count, count, 64 - b.hi 1b -2: mov dst, tmp1 - sub count, dstend, tmp1 /* Remaining bytes to write. */ - subs count, count, zva_len - b.lo 4f -3: dc zva, dst - add dst, dst, zva_len - subs count, count, zva_len - b.hs 3b -4: add count, count, zva_len - sub dst, dst, 32 /* Bias dst for tail loop. */ - b L(tail64) - - .size memset, . - memset +END (memset) #endif diff --git a/newlib/libc/machine/aarch64/stpcpy.S b/newlib/libc/machine/aarch64/stpcpy.S index 696b45889f..155c68d75a 100644 --- a/newlib/libc/machine/aarch64/stpcpy.S +++ b/newlib/libc/machine/aarch64/stpcpy.S @@ -1,34 +1,10 @@ /* - stpcpy - copy a string returning pointer to end. + * stpcpy - copy a string returning pointer to end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ - Copyright (c) 2015 ARM Ltd. - All Rights Reserved. +#define BUILD_STPCPY 1 - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* This is just a wrapper that uses strcpy code with appropriate - pre-defines. */ - -#define BUILD_STPCPY #include "strcpy.S" diff --git a/newlib/libc/machine/aarch64/strchr.S b/newlib/libc/machine/aarch64/strchr.S index 2448dbc7d5..500d9aff29 100644 --- a/newlib/libc/machine/aarch64/strchr.S +++ b/newlib/libc/machine/aarch64/strchr.S @@ -1,32 +1,9 @@ /* - strchr - find a character in a string - - Copyright (c) 2014, ARM Limited - All rights Reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - + * strchr - find a character in a string + * + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strchr-stub.c */ #else @@ -37,6 +14,8 @@ * Neon Available. */ +#include "asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -74,26 +53,19 @@ /* Locals and temporaries. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn strchr - /* Magic constant 0x40100401 to allow us to identify which lane - matches the requested byte. Magic constant 0x80200802 used - similarly for NUL termination. */ - mov wtmp2, #0x0401 - movk wtmp2, #0x4010, lsl #16 +ENTRY (strchr) + PTR_ARG (0) + /* Magic constant 0xc0300c03 to allow us to identify which lane + matches the requested byte. Even bits are set if the character + matches, odd bits if either the char is NUL or matches. 
*/ + mov wtmp2, 0x0c03 + movk wtmp2, 0xc030, lsl 16 dup vrepchr.16b, chrin bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask_c.4s, wtmp2 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -105,49 +77,42 @@ def_fn strchr cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b lsl tmp1, tmp1, #1 addp vend1.16b, vend1.16b, vend2.16b // 256->128 mov tmp3, #~0 addp vend1.16b, vend1.16b, vend2.16b // 128->64 lsr tmp1, tmp3, tmp1 - mov tmp3, vend1.2d[0] + mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: + .p2align 4 +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - /* Use a fast check for the termination condition. 
*/ - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b - orr vend1.16b, vend1.16b, vend2.16b - addp vend1.2d, vend1.2d, vend1.2d - mov tmp1, vend1.2d[0] - cbz tmp1, .Lloop + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b + umaxp vend1.16b, vend1.16b, vend1.16b + mov tmp1, vend1.d[0] + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b addp vend1.16b, vend1.16b, vend2.16b // 256->128 addp vend1.16b, vend1.16b, vend2.16b // 128->64 - - mov tmp1, vend1.2d[0] -.Ltail: + mov tmp1, vend1.d[0] +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ @@ -160,5 +125,5 @@ def_fn strchr csel result, result, xzr, eq ret - .size strchr, . - strchr +END (strchr) #endif diff --git a/newlib/libc/machine/aarch64/strchrnul.S b/newlib/libc/machine/aarch64/strchrnul.S index a0ac13b7f4..ceaf4dca17 100644 --- a/newlib/libc/machine/aarch64/strchrnul.S +++ b/newlib/libc/machine/aarch64/strchrnul.S @@ -1,32 +1,9 @@ /* - strchrnul - find a character or nul in a string - - Copyright (c) 2014, ARM Limited - All rights Reserved. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - + * strchrnul - find a character or nul in a string + * + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strchrnul-stub.c */ #else @@ -37,6 +14,8 @@ * Neon Available. */ +#include "asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -70,15 +49,8 @@ /* Locals and temporaries. 
*/ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn strchrnul +ENTRY (strchrnul) + PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 @@ -87,7 +59,7 @@ def_fn strchrnul bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask.4s, wtmp2 ands tmp1, srcin, #31 - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -95,47 +67,43 @@ def_fn strchrnul syndrome that are related to the padding. */ ld1 {vdata1.16b, vdata2.16b}, [src], #32 neg tmp1, tmp1 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b - orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b lsl tmp1, tmp1, #1 addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 mov tmp3, #~0 addp vend1.16b, vend1.16b, vend1.16b // 128->64 lsr tmp1, tmp3, tmp1 - mov tmp3, vend1.2d[0] + mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: + .p2align 4 +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - /* Use a fast check for the termination condition. 
*/ - orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b - orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b - orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b - addp vend1.2d, vend1.2d, vend1.2d - mov tmp1, vend1.2d[0] - cbz tmp1, .Lloop + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b + umaxp vend1.16b, vend1.16b, vend1.16b + mov tmp1, vend1.d[0] + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 addp vend1.16b, vend1.16b, vend1.16b // 128->64 - mov tmp1, vend1.2d[0] -.Ltail: + mov tmp1, vend1.d[0] +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ @@ -145,5 +113,5 @@ def_fn strchrnul add result, src, tmp1, lsr #1 ret - .size strchrnul, . - strchrnul +END (strchrnul) #endif diff --git a/newlib/libc/machine/aarch64/strcmp.S b/newlib/libc/machine/aarch64/strcmp.S index e2bef2d49d..691a1760ee 100644 --- a/newlib/libc/machine/aarch64/strcmp.S +++ b/newlib/libc/machine/aarch64/strcmp.S @@ -1,202 +1,192 @@ -/* Copyright (c) 2012-2018, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -/* Assumptions: +/* + * strcmp - compare two strings * - * ARMv8-a, AArch64 + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strcmp-stub.c */ #else - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. + */ -#define L(label) .L ## label +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 -/* Parameters and result. */ #define src1 x0 #define src2 x1 #define result x0 -/* Internal variables. */ #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 +#define off1 x5 #define syndrome x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define zeroones x10 -#define pos x11 - - /* Start of performance-critical section -- one 64B cache line. 
*/ -def_fn strcmp p2align=6 - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + + +ENTRY (strcmp) + PTR_ARG (0) + PTR_ARG (1) + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ + cbnz tmp, L(mutual_align) + + .p2align 4 + L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 + ldr data2, [src1, off2] + ldr data1, [src1], 8 L(start_realigned): - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. 
*/ - L(end): -#ifndef __AARCH64EB__ +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, syndrome rev data2, data2 - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#else - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ - lsl data1, data1, pos - lsl data2, data2, pos + lsl data1, data1, shift + lsl data2, data2, shift /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. 
*/ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 ret -#endif + + .p2align 4 L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off - the bytes that preceed the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr data1, data1, tmp2 - orr data2, data2, tmp2 + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond page boundary in - SRC2. */ - tst src1, #7 - b.eq L(loop_misaligned) + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) L(do_misaligned): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ b.ne L(done) - tst src1, #7 + tst src1, 7 b.ne L(do_misaligned) -L(loop_misaligned): - /* Test if we are within the last dword of the end of a 4K page. If - yes then jump back to the misaligned loop to copy a byte at a time. 
*/ - and tmp1, src2, #0xff8 - eor tmp1, tmp1, #0xff8 - cbz tmp1, L(do_misaligned) - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_misaligned) b L(end) L(done): sub result, data1, data2 ret - .size strcmp, .-strcmp +END (strcmp) #endif diff --git a/newlib/libc/machine/aarch64/strcpy.S b/newlib/libc/machine/aarch64/strcpy.S index e5405f2535..57c46f3908 100644 --- a/newlib/libc/machine/aarch64/strcpy.S +++ b/newlib/libc/machine/aarch64/strcpy.S @@ -1,341 +1,160 @@ /* - strcpy/stpcpy - copy a string returning pointer to start/end. - - Copyright (c) 2013, 2014, 2015 ARM Ltd. - All Rights Reserved. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - + * strcpy/stpcpy - copy a string returning pointer to start/end. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strchr-stub.c */ #else /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. 
+#include "asmdefs.h" - To test the page crossing code path more thoroughly, compile with - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ #define dstin x0 #define srcin x1 +#define result x0 -/* Locals and temporaries. */ #define src x2 #define dst x3 -#define data1 x4 -#define data1w w4 -#define data2 x5 -#define data2w w5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define data1a x13 -#define data2a x14 -#define pos x15 -#define len x16 -#define to_align x17 +#define len x4 +#define synd x4 +#define tmp x5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vend v2 +#define dend d2 +#define dataq2 q1 #ifdef BUILD_STPCPY -#define STRCPY stpcpy +# define STRCPY stpcpy +# define IFSTPCPY(X,...) X,__VA_ARGS__ #else -#define STRCPY strcpy +# define STRCPY strcpy +# define IFSTPCPY(X,...) #endif - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - /* AArch64 systems have a minimum page size of 4k. We can do a quick - page size check for crossing this boundary on entry and if we - do not, then we can short-circuit much of the entry code. We - expect early page-crossing strings to be rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite - predictable, even with random strings. 
-
-	   We don't bother checking for larger page sizes, the cost of setting
-	   up the correct page size is just not worth the extra gain from
-	   a small reduction in the cases taking the slow path.  Note that
-	   we only care about whether the first fetch, which may be
-	   misaligned, crosses a page boundary - after that we move to aligned
-	   fetches for the remainder of the string.  */
-
-#ifdef STRCPY_TEST_PAGE_CROSS
-	/* Make everything that isn't Qword aligned look like a page cross.  */
-#define MIN_PAGE_P2 4
-#else
-#define MIN_PAGE_P2 12
-#endif
-
-#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
-
-def_fn STRCPY p2align=6
-	/* For moderately short strings, the fastest way to do the copy is to
-	   calculate the length of the string in the same way as strlen, then
-	   essentially do a memcpy of the result.  This avoids the need for
-	   multiple byte copies and further means that by the time we
-	   reach the bulk copy loop we know we can always use DWord
-	   accesses.  We expect strcpy to rarely be called repeatedly
-	   with the same source string, so branch prediction is likely to
-	   always be difficult - we mitigate against this by preferring
-	   conditional select operations over branches whenever this is
-	   feasible.  */
-	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)
-	mov	zeroones, #REP8_01
-	and	to_align, srcin, #15
-	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
-	neg	tmp1, to_align
-	/* The first fetch will straddle a (possible) page boundary iff
-	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
-	   aligned string will never fail the page align check, so will
-	   always take the fast path.  */
-	b.gt	.Lpage_cross
-
-.Lpage_cross_ok:
-	ldp	data1, data2, [srcin]
-#ifdef __AARCH64EB__
-	/* Because we expect the end to be found within 16 characters
-	   (profiling shows this is the most common case), it's worth
-	   swapping the bytes now to save having to recalculate the
-	   termination syndrome later.  We preserve data1 and data2
-	   so that we can re-use the values later on.  */
-	rev	tmp2, data1
-	sub	tmp1, tmp2, zeroones
-	orr	tmp2, tmp2, #REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	b.ne	.Lfp_le8
-	rev	tmp4, data2
-	sub	tmp3, tmp4, zeroones
-	orr	tmp4, tmp4, #REP8_7f
-#else
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	b.ne	.Lfp_le8
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
+
+ENTRY (STRCPY)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	bic	src, srcin, 15
+	ld1	{vdata.16b}, [src]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	lsl	shift, srcin, 2
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	lsr	synd, synd, shift
+	cbnz	synd, L(tail)
+
+	ldr	dataq, [src, 16]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	cbz	synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+	rbit	synd, synd
 #endif
-	bics	has_nul2, tmp3, tmp4
-	b.eq	.Lbulk_entry
+	sub	tmp, src, srcin
+	clz	len, synd
+	add	len, tmp, len, lsr 2
+	tbz	len, 4, L(less16)
+	sub	tmp, len, 15
+	ldr	dataq, [srcin]
+	ldr	dataq2, [srcin, tmp]
+	str	dataq, [dstin]
+	str	dataq2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
+	ret
 
-	/* The string is short (<=16 bytes).  We don't know exactly how
-	   short though, yet.  Work out the exact length so that we can
-	   quickly select the optimal copy strategy.  */
-.Lfp_gt8:
-	rev	has_nul2, has_nul2
-	clz	pos, has_nul2
-	mov	tmp2, #56
-	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
-	sub	pos, tmp2, pos
-#ifdef __AARCH64EB__
-	lsr	data2, data2, pos
-#else
-	lsl	data2, data2, pos
-#endif
-	str	data2, [dst, #1]
+L(tail):
+	rbit	synd, synd
+	clz	len, synd
+	lsr	len, len, 2
+L(less16):
+	tbz	len, 3, L(less8)
+	sub	tmp, len, 7
+	ldr	data1, [srcin]
+	ldr	data2, [srcin, tmp]
 	str	data1, [dstin]
-#ifdef BUILD_STPCPY
-	add	dstin, dst, #8
-#endif
+	str	data2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
 	ret
 
-.Lfp_le8:
-	rev	has_nul1, has_nul1
-	clz	pos, has_nul1
-	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
-	subs	tmp2, pos, #24			/* Pos in bits. */
-	b.lt	.Lfp_lt4
-#ifdef __AARCH64EB__
-	mov	tmp2, #56
-	sub	pos, tmp2, pos
-	lsr	data2, data1, pos
-	lsr	data1, data1, #32
-#else
-	lsr	data2, data1, tmp2
-#endif
-	/* 4->7 bytes to copy.  */
-	str	data2w, [dst, #-3]
-	str	data1w, [dstin]
-#ifdef BUILD_STPCPY
-	mov	dstin, dst
-#endif
-	ret
-.Lfp_lt4:
-	cbz	pos, .Lfp_lt2
-	/* 2->3 bytes to copy.  */
-#ifdef __AARCH64EB__
-	lsr	data1, data1, #48
-#endif
-	strh	data1w, [dstin]
-	/* Fall-through, one byte (max) to go.  */
-.Lfp_lt2:
-	/* Null-terminated string.  Last character must be zero!  */
-	strb	wzr, [dst]
-#ifdef BUILD_STPCPY
-	mov	dstin, dst
-#endif
+	.p2align 4
+L(less8):
+	subs	tmp, len, 3
+	b.lo	L(less4)
+	ldr	dataw1, [srcin]
+	ldr	dataw2, [srcin, tmp]
+	str	dataw1, [dstin]
+	str	dataw2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
 	ret
 
-	.p2align 6
-	/* Aligning here ensures that the entry code and main loop all lies
-	   within one 64-byte cache line.  */
-.Lbulk_entry:
-	sub	to_align, to_align, #16
-	stp	data1, data2, [dstin]
-	sub	src, srcin, to_align
-	sub	dst, dstin, to_align
-	b	.Lentry_no_page_cross
-
-	/* The inner loop deals with two Dwords at a time.  This has a
-	   slightly higher start-up cost, but we should win quite quickly,
-	   especially on cores with a high number of issue slots per
-	   cycle, as we get much better parallelism out of the operations.  */
-.Lmain_loop:
-	stp	data1, data2, [dst], #16
-.Lentry_no_page_cross:
-	ldp	data1, data2, [src], #16
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	.Lmain_loop
-
-	/* Since we know we are copying at least 16 bytes, the fastest way
-	   to deal with the tail is to determine the location of the
-	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
-	cmp	has_nul1, #0
-#ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
-	csel	data1, data1, data2, ne
-	rev	data1, data1
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-#else
-	csel	has_nul1, has_nul1, has_nul2, ne
-#endif
-	rev	has_nul1, has_nul1
-	clz	pos, has_nul1
-	add	tmp1, pos, #72
-	add	pos, pos, #8
-	csel	pos, pos, tmp1, ne
-	add	src, src, pos, lsr #3
-	add	dst, dst, pos, lsr #3
-	ldp	data1, data2, [src, #-32]
-	stp	data1, data2, [dst, #-16]
-#ifdef BUILD_STPCPY
-	sub	dstin, dst, #1
-#endif
+L(less4):
+	cbz	len, L(zerobyte)
+	ldrh	dataw1, [srcin]
+	strh	dataw1, [dstin]
+L(zerobyte):
+	strb	wzr, [dstin, len]
+	IFSTPCPY (add result, dstin, len)
 	ret
 
-.Lpage_cross:
-	bic	src, srcin, #15
-	/* Start by loading two words at [srcin & ~15], then forcing the
-	   bytes that precede srcin to 0xff.  This means they never look
-	   like termination bytes.  */
-	ldp	data1, data2, [src]
-	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
-	tst	to_align, #7
-	csetm	tmp2, ne
-#ifdef __AARCH64EB__
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+	.p2align 4
+L(start_loop):
+	sub	tmp, srcin, dstin
+	ldr	dataq2, [srcin]
+	sub	dst, src, tmp
+	str	dataq2, [dstin]
+L(loop):
+	str	dataq, [dst], 32
+	ldr	dataq, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loopend)
+	str	dataq, [dst, -16]
+	ldr	dataq, [src, 32]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbz	synd, L(loop)
+	add	dst, dst, 16
+L(loopend):
+	shrn	vend.8b, vhas_nul.8h, 4	/* 128->64 */
+	fmov	synd, dend
+	sub	dst, dst, 31
+#ifndef __AARCH64EB__
+	rbit	synd, synd
 #endif
-	orr	data1, data1, tmp2
-	orr	data2a, data2, tmp2
-	cmp	to_align, #8
-	csinv	data1, data1, xzr, lt
-	csel	data2, data2, data2a, lt
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	.Lpage_cross_ok
-	/* We now need to make data1 and data2 look like they've been
-	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
-	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
-	neg	tmp2, to_align, lsl #3
-#ifdef __AARCH64EB__
-	lsl	data1a, data1, tmp1
-	lsr	tmp4, data2, tmp2
-	lsl	data2, data2, tmp1
-	orr	tmp4, tmp4, data1a
-	cmp	to_align, #8
-	csel	data1, tmp4, data2, lt
-	rev	tmp2, data1
-	rev	tmp4, data2
-	sub	tmp1, tmp2, zeroones
-	orr	tmp2, tmp2, #REP8_7f
-	sub	tmp3, tmp4, zeroones
-	orr	tmp4, tmp4, #REP8_7f
-#else
-	lsr	data1a, data1, tmp1
-	lsl	tmp4, data2, tmp2
-	lsr	data2, data2, tmp1
-	orr	tmp4, tmp4, data1a
-	cmp	to_align, #8
-	csel	data1, tmp4, data2, lt
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-#endif
-	bic	has_nul1, tmp1, tmp2
-	cbnz	has_nul1, .Lfp_le8
-	bic	has_nul2, tmp3, tmp4
-	b	.Lfp_gt8
+	clz	len, synd
+	lsr	len, len, 2
+	add	dst, dst, len
+	ldr	dataq, [dst, tmp]
+	str	dataq, [dst]
+	IFSTPCPY (add result, dst, 15)
+	ret
 
-	.size	STRCPY, . - STRCPY
+END (STRCPY)
 #endif
diff --git a/newlib/libc/machine/aarch64/strlen.S b/newlib/libc/machine/aarch64/strlen.S
index 872d136ef4..68a6f357cf 100644
--- a/newlib/libc/machine/aarch64/strlen.S
+++ b/newlib/libc/machine/aarch64/strlen.S
@@ -1,115 +1,92 @@
-/* Copyright (c) 2013-2015, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
-
+/*
+ * strlen - calculate the length of a string.
+ *
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See strlen-stub.c */
 #else
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
  */
 
-/* To test the page crossing code path more thoroughly, compile with
-   -DTEST_PAGE_CROSS - this will force all calls through the slower
-   entry path.  This option is not intended for production use.  */
-
-/* Arguments and results.  */
-#define srcin		x0
-#define len		x0
-
-/* Locals and temporaries.  */
-#define src		x1
-#define data1		x2
-#define data2		x3
-#define has_nul1	x4
-#define has_nul2	x5
-#define tmp1		x4
-#define tmp2		x5
-#define tmp3		x6
-#define tmp4		x7
-#define zeroones	x8
-
-#define L(l) .L ## l
-
-	.macro def_fn f p2align=0
-	.text
-	.p2align \p2align
-	.global \f
-	.type \f, %function
-\f:
-	.endm
-
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  A faster check
-	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
-	   false hits for characters 129..255.  */
+#include "asmdefs.h"
+
+#define srcin		x0
+#define len		x0
+
+#define src		x1
+#define data1		x2
+#define data2		x3
+#define has_nul1	x4
+#define has_nul2	x5
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8
+
+#define maskv		v0
+#define maskd		d0
+#define dataq1		q1
+#define dataq2		q2
+#define datav1		v1
+#define datav2		v2
+#define tmp		x2
+#define tmpw		w2
+#define synd		x3
+#define syndw		w3
+#define shift		x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+   byte is zero, and can be done in parallel across the entire word.  */
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.  */
 
 #ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
 #else
 # define MIN_PAGE_SIZE 4096
 #endif
 
-	/* Since strings are short on average, we check the first 16 bytes
-	   of the string for a NUL character.  In order to do an unaligned ldp
-	   safely we have to do a page cross check first.  If there is a NUL
-	   byte we calculate the length from the 2 8-byte words using
-	   conditional select to reduce branch mispredictions (it is unlikely
-	   strlen will be repeatedly called on strings with the same length).
-
-	   If the string is longer than 16 bytes, we align src so don't need
-	   further page cross checks, and process 32 bytes per iteration
-	   using the fast NUL check.  If we encounter non-ASCII characters,
-	   fallback to a second loop using the full NUL check.
-
-	   If the page cross check fails, we read 16 bytes from an aligned
-	   address, remove any characters before the string, and continue
-	   in the main loop using aligned loads.  Since strings crossing a
-	   page in the first 16 bytes are rare (probability of
-	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
-	   AArch64 systems have a minimum page size of 4k.  We don't bother
-	   checking for larger page sizes - the cost of setting up the correct
-	   page size is just not worth the extra gain from a small reduction in
-	   the cases taking the slow path.  Note that we only care about
-	   whether the first fetch, which may be misaligned, crosses a page
-	   boundary.  */
-
-def_fn strlen p2align=6
+/* Core algorithm:
+
+   Since strings are short on average, we check the first 32 bytes of the
+   string for a NUL character without aligning the string.  In order to use
+   unaligned loads safely we must do a page cross check first.
+
+   If there is a NUL byte we calculate the length from the 2 8-byte words
+   using conditional select to reduce branch mispredictions (it is unlikely
+   strlen will be repeatedly called on strings with the same length).
+
+   If the string is longer than 32 bytes, align src so we don't need further
+   page cross checks, and process 32 bytes per iteration using a fast SIMD
+   loop.
+
+   If the page cross check fails, we read 32 bytes from an aligned address,
+   and ignore any characters before the string.  If it contains a NUL
+   character, return the length, if not, continue in the main loop.  */
+
+ENTRY (strlen)
+	PTR_ARG (0)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
-	mov	zeroones, REP8_01
-	cmp	tmp1, MIN_PAGE_SIZE - 16
-	b.gt	L(page_cross)
+	cmp	tmp1, MIN_PAGE_SIZE - 32
+	b.hi	L(page_cross)
+
+	/* Look for a NUL byte in the first 16 bytes.  */
 	ldp	data1, data2, [srcin]
+	mov	zeroones, REP8_01
+
 #ifdef __AARCH64EB__
 	/* For big-endian, carry propagation (if the final byte in the
 	   string is 0x01) means we cannot use has_nul1/2 directly.
@@ -125,114 +102,96 @@ def_fn strlen p2align=6
 	bics	has_nul1, tmp1, tmp2
 	bic	has_nul2, tmp3, tmp4
 	ccmp	has_nul2, 0, 0, eq
-	beq	L(main_loop_entry)
+	b.eq	L(bytes16_31)
 
-	/* Enter with C = has_nul1 == 0.  */
+	/* Find the exact offset of the first NUL byte in the first 16 bytes
+	   from the string start.  Enter with C = has_nul1 == 0.  */
 	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
 	rev	has_nul1, has_nul1
-	clz	tmp1, has_nul1
 	csel	len, xzr, len, cc
+	clz	tmp1, has_nul1
 	add	len, len, tmp1, lsr 3
 	ret
 
-	/* The inner loop processes 32 bytes per iteration and uses the fast
-	   NUL check.  If we encounter non-ASCII characters, use a second
-	   loop with the accurate NUL check.  */
-	.p2align 4
-L(main_loop_entry):
-	bic	src, srcin, 15
-	sub	src, src, 16
-L(main_loop):
-	ldp	data1, data2, [src, 32]!
-.Lpage_cross_entry:
-	sub	tmp1, data1, zeroones
-	sub	tmp3, data2, zeroones
-	orr	tmp2, tmp1, tmp3
-	tst	tmp2, zeroones, lsl 7
-	bne	1f
-	ldp	data1, data2, [src, 16]
+	/* Look for a NUL byte at offset 16..31 in the string.  */
+L(bytes16_31):
+	ldp	data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
 	sub	tmp1, data1, zeroones
-	sub	tmp3, data2, zeroones
-	orr	tmp2, tmp1, tmp3
-	tst	tmp2, zeroones, lsl 7
-	beq	L(main_loop)
-	add	src, src, 16
-1:
-	/* The fast check failed, so do the slower, accurate NUL check.  */
 	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
 	orr	tmp4, data2, REP8_7f
 	bics	has_nul1, tmp1, tmp2
 	bic	has_nul2, tmp3, tmp4
 	ccmp	has_nul2, 0, 0, eq
-	beq	L(nonascii_loop)
+	b.eq	L(loop_entry)
 
-	/* Enter with C = has_nul1 == 0.  */
-L(tail):
-#ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul1/2 directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
-	csel	data1, data1, data2, cc
-	rev	data1, data1
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	bic	has_nul1, tmp1, tmp2
-#else
+	/* Find the exact offset of the first NUL byte at offset 16..31 from
+	   the string start.  Enter with C = has_nul1 == 0.  */
 	csel	has_nul1, has_nul1, has_nul2, cc
-#endif
-	sub	len, src, srcin
+	mov	len, 24
 	rev	has_nul1, has_nul1
-	add	tmp2, len, 8
+	mov	tmp3, 16
 	clz	tmp1, has_nul1
-	csel	len, len, tmp2, cc
+	csel	len, tmp3, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
-L(nonascii_loop):
-	ldp	data1, data2, [src, 16]!
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	bne	L(tail)
-	ldp	data1, data2, [src, 16]!
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	beq	L(nonascii_loop)
-	b	L(tail)
+	nop
+L(loop_entry):
+	bic	src, srcin, 31
+
+	.p2align 5
+L(loop):
+	ldp	dataq1, dataq2, [src, 32]!
+	uminp	maskv.16b, datav1.16b, datav2.16b
+	uminp	maskv.16b, maskv.16b, maskv.16b
+	cmeq	maskv.8b, maskv.8b, 0
+	fmov	synd, maskd
+	cbz	synd, L(loop)
+
+	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
+	cmeq	maskv.16b, datav1.16b, 0
+	sub	len, src, srcin
+	cbnz	syndw, 1f
+	cmeq	maskv.16b, datav2.16b, 0
+	add	len, len, 16
+1:
+	/* Generate a bitmask and compute correct byte offset.  */
+	shrn	maskv.8b, maskv.8h, 4
+	fmov	synd, maskd
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	clz	tmp, synd
+	add	len, len, tmp, lsr 2
+	ret
 
-	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
-	   srcin to 0x7f, so we ignore any NUL bytes before the string.
-	   Then continue in the aligned loop.  */
 L(page_cross):
-	bic	src, srcin, 15
-	ldp	data1, data2, [src]
-	lsl	tmp1, srcin, 3
-	mov	tmp4, -1
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	orr	tmp1, tmp1, REP8_80
-	orn	data1, data1, tmp1
-	orn	tmp2, data2, tmp1
-	tst	srcin, 8
-	csel	data1, data1, tmp4, eq
-	csel	data2, data2, tmp2, eq
-	b	L(page_cross_entry)
-
-	.size	strlen, . - strlen
+	bic	src, srcin, 31
+	mov	tmpw, 0x0c03
+	movk	tmpw, 0xc030, lsl 16
+	ld1	{datav1.16b, datav2.16b}, [src]
+	dup	maskv.4s, tmpw
+	cmeq	datav1.16b, datav1.16b, 0
+	cmeq	datav2.16b, datav2.16b, 0
+	and	datav1.16b, datav1.16b, maskv.16b
+	and	datav2.16b, datav2.16b, maskv.16b
+	addp	maskv.16b, datav1.16b, datav2.16b
+	addp	maskv.16b, maskv.16b, maskv.16b
+	fmov	synd, maskd
+	lsl	shift, srcin, 1
+	lsr	synd, synd, shift
+	cbz	synd, L(loop)
+
+	rbit	synd, synd
+	clz	len, synd
+	lsr	len, len, 1
+	ret
+
+END (strlen)
 #endif
diff --git a/newlib/libc/machine/aarch64/strncmp.S b/newlib/libc/machine/aarch64/strncmp.S
index ffdabc2607..373695503d 100644
--- a/newlib/libc/machine/aarch64/strncmp.S
+++ b/newlib/libc/machine/aarch64/strncmp.S
@@ -1,49 +1,23 @@
-/* Copyright (c) 2013, 2018, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
-
+/*
+ * strncmp - compare two strings
+ *
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See strcmp-stub.c */
 #else
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
  */
 
-	.macro def_fn f p2align=0
-	.text
-	.p2align \p2align
-	.global \f
-	.type \f, %function
-\f:
-	.endm
+#include "asmdefs.h"
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
 
 /* Parameters and result.  */
 #define src1		x0
@@ -64,86 +38,91 @@
 #define tmp3		x10
 #define zeroones	x11
 #define pos		x12
-#define limit_wd	x13
-#define mask		x14
-#define endloop		x15
+#define mask		x13
+#define endloop		x14
 #define count		mask
+#define offset		pos
+#define neg_offset	x15
+
+/* Define endian dependent shift operations.
+   On big-endian early bytes are at MSB and on little-endian LSB.
+   LS_FW means shifting towards early bytes.
+   LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
 
-	.text
-	.p2align 6
-	.rep 7
-	nop	/* Pad so that the loop below fits a cache line.  */
-	.endr
-def_fn strncmp
-	cbz	limit, .Lret0
+ENTRY (strncmp)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+	cbz	limit, L(ret0)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
 	and	count, src1, #7
-	b.ne	.Lmisaligned8
-	cbnz	count, .Lmutual_align
-	/* Calculate the number of full and partial words -1.  */
-	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
-	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
+	b.ne	L(misaligned8)
+	cbnz	count, L(mutual_align)
 
 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
 	   can be done in parallel across the entire word.  */
-	/* Start of performance-critical section  -- one 64B cache line.  */
-.Lloop_aligned:
+	.p2align 4
+L(loop_aligned):
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
-.Lstart_realigned:
-	subs	limit_wd, limit_wd, #1
+L(start_realigned):
+	subs	limit, limit, #8
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, #REP8_7f
 	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, pl	/* Last Dword or differences.  */
+	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	ccmp	endloop, #0, #0, eq
-	b.eq	.Lloop_aligned
-	/* End of performance-critical section  -- one 64B cache line.  */
+	b.eq	L(loop_aligned)
+	/* End of main loop */
 
-	/* Not reached the limit, must have found the end or a diff.  */
-	tbz	limit_wd, #63, .Lnot_limit
-
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	.Lnot_limit
-
-	lsl	limit, limit, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-#ifdef __AARCH64EB__
-	lsr	mask, mask, limit
-#else
-	lsl	mask, mask, limit
-#endif
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	/* Make sure that the NUL byte is marked in the syndrome.  */
-	orr	has_nul, has_nul, mask
-
-.Lnot_limit:
+L(full_check):
+#ifndef __AARCH64EB__
 	orr	syndrome, diff, has_nul
-
-#ifndef __AARCH64EB__
+	add	limit, limit, 8	/* Rewind limit to before last subs.  */
+L(syndrome_check):
+	/* Limit was reached.  Check if the NUL byte or the difference
+	   is before the limit.  */
 	rev	syndrome, syndrome
 	rev	data1, data1
-	/* The MS-non-zero bit of the syndrome marks either the first bit
-	   that is different, or the top bit of the first zero byte.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
 	clz	pos, syndrome
 	rev	data2, data2
 	lsl	data1, data1, pos
+	cmp	limit, pos, lsr #3
 	lsl	data2, data2, pos
 	/* But we need to zero-extend (char is unsigned) the value and then
 	   perform a signed 32-bit subtraction.  */
 	lsr	data1, data1, #56
 	sub	result, data1, data2, lsr #56
+	csel	result, result, xzr, hi
 	ret
 #else
+	/* Not reached the limit, must have found the end or a diff.  */
+	tbz	limit, #63, L(not_limit)
+	add	tmp1, limit, 8
+	cbz	limit, L(not_limit)
+
+	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
+	mov	mask, #~0
+	lsr	mask, mask, limit
+	bic	data1, data1, mask
+	bic	data2, data2, mask
+
+	/* Make sure that the NUL byte is marked in the syndrome.  */
+	orr	has_nul, has_nul, mask
+
+L(not_limit):
 	/* For big-endian we cannot use the trick with the syndrome value
 	   as carry-propagation can corrupt the upper bits if the trailing
 	   bytes in the string contain 0x01.  */
@@ -164,10 +143,11 @@ def_fn strncmp
 	rev	has_nul, has_nul
 	orr	syndrome, diff, has_nul
 	clz	pos, syndrome
-	/* The MS-non-zero bit of the syndrome marks either the first bit
-	   that is different, or the top bit of the first zero byte.
+	/* The most-significant-non-zero bit of the syndrome marks either the
+	   first bit that is different, or the top bit of the first zero byte.
 	   Shifting left now will bring the critical information into the
 	   top bits.  */
+L(end_quick):
 	lsl	data1, data1, pos
 	lsl	data2, data2, pos
 	/* But we need to zero-extend (char is unsigned) the value and then
@@ -177,7 +157,7 @@ def_fn strncmp
 	ret
 #endif
 
-.Lmutual_align:
+L(mutual_align):
 	/* Sources are mutually aligned, but are not currently at an
 	   alignment boundary.  Round down the addresses and then mask off
 	   the bytes that precede the start point.
@@ -189,102 +169,143 @@ def_fn strncmp
 	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
-	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-#endif
-	and	tmp3, limit_wd, #7
-	lsr	limit_wd, limit_wd, #3
-	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, count
-	add	tmp3, tmp3, count
+	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
+	/* Adjust the limit and ensure it doesn't overflow.  */
+	adds	limit, limit, count
+	csinv	limit, limit, xzr, lo
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
-	add	limit_wd, limit_wd, tmp3, lsr #3
-	b	.Lstart_realigned
+	b	L(start_realigned)
 
-	.p2align 6
+	.p2align 4
 	/* Don't bother with dwords for up to 16 bytes.  */
-.Lmisaligned8:
+L(misaligned8):
 	cmp	limit, #16
-	b.hs	.Ltry_misaligned_words
+	b.hs	L(try_misaligned_words)
 
-.Lbyte_loop:
+L(byte_loop):
 	/* Perhaps we can do better than this.  */
 	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
 	subs	limit, limit, #1
 	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	.Lbyte_loop
-.Ldone:
+	b.eq	L(byte_loop)
+L(done):
 	sub	result, data1, data2
 	ret
 
 	/* Align the SRC1 to a dword by doing a bytewise compare and then do
 	   the dword loop.  */
-.Ltry_misaligned_words:
-	lsr	limit_wd, limit, #3
-	cbz	count, .Ldo_misaligned
+L(try_misaligned_words):
+	cbz	count, L(src1_aligned)
 
 	neg	count, count
 	and	count, count, #7
 	sub	limit, limit, count
-	lsr	limit_wd, limit, #3
 
-.Lpage_end_loop:
+L(page_end_loop):
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
 	cmp	data1w, #1
 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.ne	.Ldone
+	b.ne	L(done)
 	subs	count, count, #1
-	b.hi	.Lpage_end_loop
+	b.hi	L(page_end_loop)
+
+	/* The following diagram explains the comparison of misaligned strings.
+	   The bytes are shown in natural order. For little-endian, it is
+	   reversed in the registers. The "x" bytes are before the string.
+	   The "|" separates data that is loaded at one time.
+	   src1 | a a a a a a a a | b b b c c c c c | . . .
+	   src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
 
-.Ldo_misaligned:
-	/* Prepare ourselves for the next page crossing.  Unlike the aligned
-	   loop, we fetch 1 less dword because we risk crossing bounds on
-	   SRC2.  */
-	mov	count, #8
-	subs	limit_wd, limit_wd, #1
-	b.lo	.Ldone_loop
-.Lloop_misaligned:
-	and	tmp2, src2, #0xff8
-	eor	tmp2, tmp2, #0xff8
-	cbz	tmp2, .Lpage_end_loop
+	   After shifting in each step, the data looks like this:
+	       STEP_A       STEP_B       STEP_C
+	   data1 a a a a a a a a  b b b c c c c c  b b b c c c c c
+	   data2 a a a a a a a a  b b b 0 0 0 0 0  0 0 0 c c c c c
+	   The bytes with "0" are eliminated from the syndrome via mask.
+
+	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+	   time from SRC2. The comparison happens in 3 steps. After each step
+	   the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+	/* Calculate offset from 8 byte alignment to string start in bits. No
+	   need to mask offset since shifts are ignoring upper bits. */
+	lsl	offset, src2, #3
+	bic	src2, src2, #0xf
+	mov	mask, -1
+	neg	neg_offset, offset
 	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
-	ccmp	diff, #0, #0, eq
-	b.ne	.Lnot_limit
-	subs	limit_wd, limit_wd, #1
-	b.pl	.Lloop_misaligned
+	ldp	tmp1, tmp2, [src2], #16
+	LS_BK	mask, mask, neg_offset
+	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
+	/* Skip the first compare if data in tmp1 is irrelevant.  */
+	tbnz	offset, 6, L(misaligned_mid_loop)
 
-.Ldone_loop:
-	/* We found a difference or a NULL before the limit was reached.  */
-	and	limit, limit, #7
-	cbz	limit, .Lnot_limit
-	/* Read the last word.  */
-	sub	src1, src1, 8
-	sub	src2, src2, 8
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
+L(loop_misaligned):
+	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+	LS_FW	data2, tmp1, offset
+	LS_BK	tmp1, tmp2, neg_offset
+	subs	limit, limit, #8
+	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
+	sub	has_nul, data1, zeroones
 	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
-	ccmp	diff, #0, #0, eq
-	b.ne	.Lnot_limit
+	orr	tmp3, data1, #REP8_7f
+	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
+	orr	tmp3, endloop, has_nul
+	cbnz	tmp3, L(full_check)
+
+	ldr	data1, [src1], #8
+L(misaligned_mid_loop):
+	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
+	LS_FW	data2, tmp2, offset
+#ifdef __AARCH64EB__
+	/* For big-endian we do a byte reverse to avoid carry-propagation
+	   problem described above. This way we can reuse the has_nul in the
+	   next step and also use syndrome value trick at the end.  */
+	rev	tmp3, data1
+	#define data1_fixed tmp3
+#else
+	#define data1_fixed data1
+#endif
+	sub	has_nul, data1_fixed, zeroones
+	orr	tmp3, data1_fixed, #REP8_7f
+	eor	diff, data2, data1	/* Non-zero if differences found.  */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
+#ifdef __AARCH64EB__
+	rev	has_nul, has_nul
+#endif
+	cmp	limit, neg_offset, lsr #3
+	orr	syndrome, diff, has_nul
+	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
+	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
+	cbnz	tmp3, L(syndrome_check)
+
+	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
+	ldp	tmp1, tmp2, [src2], #16
+	cmp	limit, #8
+	LS_BK	data2, tmp1, neg_offset
+	eor	diff, data2, data1	/* Non-zero if differences found.  */
+	orr	syndrome, diff, has_nul
+	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
+	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
+	cbnz	tmp3, L(syndrome_check)
+
+	ldr	data1, [src1], #8
+	sub	limit, limit, #8
+	b	L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+	clz	pos, syndrome
+	cmp	pos, limit, lsl #3
+	b.lo	L(end_quick)
+#endif
 
-.Lret0:
+L(ret0):
 	mov	result, #0
 	ret
-	.size	strncmp, . - strncmp
+END(strncmp)
 #endif
diff --git a/newlib/libc/machine/aarch64/strnlen.S b/newlib/libc/machine/aarch64/strnlen.S
index c255c3f7c6..091002e0b0 100644
--- a/newlib/libc/machine/aarch64/strnlen.S
+++ b/newlib/libc/machine/aarch64/strnlen.S
@@ -1,187 +1,105 @@
-/* strnlen - calculate the length of a string with limit.
-
-   Copyright (c) 2013, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - +/* + * strnlen - calculate the length of a string with limit. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strlen-stub.c */ #else /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ -/* Arguments and results. */ +#include "asmdefs.h" + #define srcin x0 -#define len x0 -#define limit x1 +#define cntin x1 +#define result x0 -/* Locals and temporaries. */ #define src x2 -#define data1 x3 -#define data2 x4 -#define data2a x5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define pos x13 -#define limit_wd x14 - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - .text - .p2align 6 -.Lstart: - /* Pre-pad to ensure critical loop begins an icache line. */ - .rep 7 - nop - .endr - /* Put this code here to avoid wasting more space with pre-padding. 
*/ -.Lhit_limit: - mov len, limit +#define synd x3 +#define shift x4 +#define tmp x4 +#define cntrem x5 + +#define qdata q0 +#define vdata v0 +#define vhas_chr v1 +#define vend v2 +#define dend d2 + +/* + Core algorithm: + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ + +ENTRY (strnlen) + PTR_ARG (0) + SIZE_ARG (1) + bic src, srcin, 15 + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src] + cmeq vhas_chr.16b, vdata.16b, 0 + lsl shift, srcin, 2 + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(start_loop) +L(finish): + rbit synd, synd + clz synd, synd + lsr result, synd, 2 + cmp cntin, result + csel result, cntin, result, ls ret -def_fn strnlen - cbz limit, .Lhit_limit - mov zeroones, #REP8_01 - bic src, srcin, #15 - ands tmp1, srcin, #15 - b.ne .Lmisaligned - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ - - /* Start of critial section -- keep to one 64Byte cache line. 
*/ -.Lloop: - ldp data1, data2, [src], #16 -.Lrealigned: - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - subs limit_wd, limit_wd, #1 - orr tmp1, has_nul1, has_nul2 - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ - b.eq .Lloop - /* End of critical section -- keep to one 64Byte cache line. */ - - orr tmp1, has_nul1, has_nul2 - cbz tmp1, .Lhit_limit /* No null in final Qword. */ - - /* We know there's a null in the final Qword. The easiest thing - to do now is work out the length of the string and return - MIN (len, limit). */ - - sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 -#ifdef __AARCH64EB__ - mov data2, data1 -#endif - sub len, len, #8 - mov has_nul2, has_nul1 -.Lnul_in_data2: -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - rev data2, data2 - sub tmp1, data2, zeroones - orr tmp2, data2, #REP8_7f - bic has_nul2, tmp1, tmp2 -#endif - sub len, len, #8 - rev has_nul2, has_nul2 - clz pos, has_nul2 - add len, len, pos, lsr #3 /* Bits to bytes. */ - cmp len, limit - csel len, len, limit, ls /* Return the lower value. */ +L(nomatch): + mov result, cntin ret -.Lmisaligned: - /* Deal with a partial first word. - We're doing two things in parallel here; - 1) Calculate the number of words (but avoiding overflow if - limit is near ULONG_MAX) - to do this we need to work out - limit + tmp1 - 1 as a 65-bit value before shifting it; - 2) Load and mask the initial data words - we force the bytes - before the ones we are interested in to 0xff - this ensures - early bytes will not hit any zero detection. 
*/ - sub limit_wd, limit, #1 - neg tmp4, tmp1 - cmp tmp1, #8 - - and tmp3, limit_wd, #15 - lsr limit_wd, limit_wd, #4 - mov tmp2, #~0 - - ldp data1, data2, [src], #16 - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ - add tmp3, tmp3, tmp1 - -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ +L(start_loop): + sub tmp, src, srcin + add tmp, tmp, 17 + subs cntrem, cntin, tmp + b.lo L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 + .p2align 5 +L(loop32): + ldr qdata, [src, 32]! + cmeq vhas_chr.16b, vdata.16b, 0 + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) +L(loop32_2): + ldr qdata, [src, 16] + subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, 0 + b.lo L(end_2) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) +L(end_2): + add src, src, 16 +L(end): + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + sub result, src, srcin + fmov synd, dend +#ifndef __AARCH64EB__ + rbit synd, synd #endif - add limit_wd, limit_wd, tmp3, lsr #4 - - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - - csinv data1, data1, xzr, le - csel data2, data2, data2a, le - b .Lrealigned - .size strnlen, . - .Lstart /* Include pre-padding in size. */ + clz synd, synd + add result, result, synd, lsr 2 + cmp cntin, result + csel result, cntin, result, ls + ret +END (strnlen) #endif diff --git a/newlib/libc/machine/aarch64/strrchr.S b/newlib/libc/machine/aarch64/strrchr.S index d64fc09b1a..b0574228b6 100644 --- a/newlib/libc/machine/aarch64/strrchr.S +++ b/newlib/libc/machine/aarch64/strrchr.S @@ -1,32 +1,9 @@ /* - strrchr - find last instance of a character in a string - - Copyright (c) 2014, ARM Limited - All rights Reserved. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the company nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - + * strrchr - find last position of a character in a string. + * + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) /* See strchr-stub.c */ #else @@ -37,6 +14,8 @@ * Neon Available. */ +#include "asmdefs.h" + /* Arguments and results. 
*/ #define srcin x0 #define chrin w1 @@ -78,17 +57,8 @@ in the original string a count_trailing_zeros() operation will identify exactly which byte is causing the termination, and why. */ -/* Locals and temporaries. */ - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn strrchr +ENTRY (strrchr) + PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used similarly for NUL termination. */ @@ -100,7 +70,7 @@ def_fn strrchr mov src_offset, #0 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq .Laligned + b.eq L(aligned) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -118,45 +88,45 @@ def_fn strrchr and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 - addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64 - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 - mov nul_match, vhas_nul1.2d[0] + addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] lsl tmp1, tmp1, #1 mov const_m1, #~0 - mov chr_match, vhas_chr1.2d[0] lsr tmp3, const_m1, tmp1 + mov chr_match, vend1.d[1] bic nul_match, nul_match, tmp3 // Mask padding bits. bic chr_match, chr_match, tmp3 // Mask padding bits. 
- cbnz nul_match, .Ltail + cbnz nul_match, L(tail) -.Lloop: + .p2align 4 +L(loop): cmp chr_match, #0 csel src_match, src, src_match, ne csel src_offset, chr_match, src_offset, ne -.Laligned: +L(aligned): ld1 {vdata1.16b, vdata2.16b}, [src], #32 - cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b - cmeq vhas_nul2.16b, vdata2.16b, #0 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b - addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + uminp vend1.16b, vdata1.16b, vdata2.16b and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + cmeq vend1.16b, vend1.16b, 0 addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 - addp vend1.16b, vend1.16b, vend1.16b // 128->64 - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 - mov nul_match, vend1.2d[0] - mov chr_match, vhas_chr1.2d[0] - cbz nul_match, .Lloop + addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] + mov chr_match, vend1.d[1] + cbz nul_match, L(loop) + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_nul2.16b, vdata2.16b, #0 and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b - mov nul_match, vhas_nul1.2d[0] + mov nul_match, vhas_nul1.d[0] -.Ltail: +L(tail): /* Work out exactly where the string ends. */ sub tmp4, nul_match, #1 eor tmp4, tmp4, nul_match @@ -178,5 +148,5 @@ def_fn strrchr ret - .size strrchr, . - strrchr +END (strrchr) #endif -- 2.35.3 ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v3 1/2] aarch64: Sync with ARM-software/optimized-routines
  2023-09-12 10:05 ` [PATCH v3 1/2] " Sebastian Huber
@ 2023-10-05 10:37   ` Richard Earnshaw
  2023-10-05 12:23     ` Sebastian Huber
  0 siblings, 1 reply; 7+ messages in thread
From: Richard Earnshaw @ 2023-10-05 10:37 UTC (permalink / raw)
  To: Sebastian Huber, newlib; +Cc: Szabolcs Nagy

Hi Sebastian,

My apologies for the delay in replying; the GNU Cauldron organizing took up
a lot of my time over the last few weeks.

This is basically ok, but you're removing an existing license and adding a
new one from Arm; I think you need to copy the new license into
COPYING.NEWLIB - it's not enough just to have an SPDX identifier, the text
of the license must be added somewhere as well.

R.

On 12/09/2023 11:05, Sebastian Huber wrote:
> Update AArch64 assembly string routines from:
>
> https://github.com/ARM-software/optimized-routines
>
> commit 0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560
> Author: Sebastian Huber <sebastian.huber@embedded-brains.de>
> Date: Thu Jul 27 17:14:57 2023 +0200
>
>     string: Fix corrupt GNU_PROPERTY_TYPE (5) size
>
>     For ELF32 the notes alignment is 4 and not 8.
> ---
>  newlib/libc/machine/aarch64/asmdefs.h   | 106 ++++++
>  newlib/libc/machine/aarch64/memchr.S    |  73 ++--
>  newlib/libc/machine/aarch64/memcmp.S    | 311 +++++++++--------
>  newlib/libc/machine/aarch64/memcpy.S    | 272 ++++++++-------
>  newlib/libc/machine/aarch64/memset.S    | 194 ++---------
>  newlib/libc/machine/aarch64/stpcpy.S    |  36 +-
>  newlib/libc/machine/aarch64/strchr.S    | 107 ++----
>  newlib/libc/machine/aarch64/strchrnul.S |  90 ++---
>  newlib/libc/machine/aarch64/strcmp.S    | 282 ++++++++-------
>  newlib/libc/machine/aarch64/strcpy.S    | 437 +++++++-----------------
>  newlib/libc/machine/aarch64/strlen.S    | 319 ++++++++---------
>  newlib/libc/machine/aarch64/strncmp.S   | 323 ++++++++++--------
>  newlib/libc/machine/aarch64/strnlen.S   | 256 +++++---------
>  newlib/libc/machine/aarch64/strrchr.S   |  86 ++---
>  14 files changed, 1226 insertions(+), 1666 deletions(-)
>  create mode 100644 newlib/libc/machine/aarch64/asmdefs.h
>
> diff --git a/newlib/libc/machine/aarch64/asmdefs.h b/newlib/libc/machine/aarch64/asmdefs.h
> new file mode 100644
> index 0000000000..131b95e1fe
> --- /dev/null
> +++ b/newlib/libc/machine/aarch64/asmdefs.h
> @@ -0,0 +1,106 @@
> +/*
> + * Macros for asm code. AArch64 version.
> + *
> + * Copyright (c) 2019-2023, Arm Limited.
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
> + */
> +
> +#ifndef _ASMDEFS_H
> +#define _ASMDEFS_H
> +
> +/* Branch Target Identitication support. */
> +#define BTI_C hint 34
> +#define BTI_J hint 36
> +/* Return address signing support (pac-ret). */
> +#define PACIASP hint 25; .cfi_window_save
> +#define AUTIASP hint 29; .cfi_window_save
> +
> +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
> +#define FEATURE_1_AND 0xc0000000
> +#define FEATURE_1_BTI 1
> +#define FEATURE_1_PAC 2
> +
> +/* Add a NT_GNU_PROPERTY_TYPE_0 note.
*/ > +#ifdef __ILP32__ > +#define GNU_PROPERTY(type, value) \ > + .section .note.gnu.property, "a"; \ > + .p2align 2; \ > + .word 4; \ > + .word 12; \ > + .word 5; \ > + .asciz "GNU"; \ > + .word type; \ > + .word 4; \ > + .word value; \ > + .text > +#else > +#define GNU_PROPERTY(type, value) \ > + .section .note.gnu.property, "a"; \ > + .p2align 3; \ > + .word 4; \ > + .word 16; \ > + .word 5; \ > + .asciz "GNU"; \ > + .word type; \ > + .word 4; \ > + .word value; \ > + .word 0; \ > + .text > +#endif > + > +/* If set then the GNU Property Note section will be added to > + mark objects to support BTI and PAC-RET. */ > +#ifndef WANT_GNU_PROPERTY > +#define WANT_GNU_PROPERTY 1 > +#endif > + > +#if WANT_GNU_PROPERTY > +/* Add property note with supported features to all asm files. */ > +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) > +#endif > + > +#define ENTRY_ALIGN(name, alignment) \ > + .global name; \ > + .type name,%function; \ > + .align alignment; \ > + name: \ > + .cfi_startproc; \ > + BTI_C; > + > +#define ENTRY(name) ENTRY_ALIGN(name, 6) > + > +#define ENTRY_ALIAS(name) \ > + .global name; \ > + .type name,%function; \ > + name: > + > +#define END(name) \ > + .cfi_endproc; \ > + .size name, .-name; > + > +#define L(l) .L ## l > + > +#ifdef __ILP32__ > + /* Sanitize padding bits of pointer arguments as per aapcs64 */ > +#define PTR_ARG(n) mov w##n, w##n > +#else > +#define PTR_ARG(n) > +#endif > + > +#ifdef __ILP32__ > + /* Sanitize padding bits of size arguments as per aapcs64 */ > +#define SIZE_ARG(n) mov w##n, w##n > +#else > +#define SIZE_ARG(n) > +#endif > + > +/* Compiler supports SVE instructions */ > +#ifndef HAVE_SVE > +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) > +# define HAVE_SVE 1 > +# else > +# define HAVE_SVE 0 > +# endif > +#endif > + > +#endif > diff --git a/newlib/libc/machine/aarch64/memchr.S b/newlib/libc/machine/aarch64/memchr.S > index 53f5d6bc0e..a0f305e0fc 100644 > --- 
a/newlib/libc/machine/aarch64/memchr.S > +++ b/newlib/libc/machine/aarch64/memchr.S > @@ -1,31 +1,8 @@ > /* > * memchr - find a character in a memory zone > * > - * Copyright (c) 2014, ARM Limited > - * All rights Reserved. > - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions are met: > - * * Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. > - * * Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in the > - * documentation and/or other materials provided with the distribution. > - * * Neither the name of the company nor the names of its contributors > - * may be used to endorse or promote products derived from this > - * software without specific prior written permission. > - * > - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + * Copyright (c) 2014-2022, Arm Limited. 
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > */ > > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > @@ -37,6 +14,8 @@ > * Neon Available. > */ > > +#include "asmdefs.h" > + > /* Arguments and results. */ > #define srcin x0 > #define chrin w1 > @@ -70,17 +49,11 @@ > * identify exactly which byte has matched. > */ > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn memchr > +ENTRY (memchr) > + PTR_ARG (0) > + SIZE_ARG (2) > /* Do not dereference srcin if no bytes to compare. */ > - cbz cntin, .Lzero_length > + cbz cntin, L(zero_length) > /* > * Magic constant 0x40100401 allows us to identify which lane matches > * the requested byte. > @@ -93,7 +66,7 @@ def_fn memchr > dup vrepmask.4s, wtmp2 > ands soff, srcin, #31 > and cntrem, cntin, #31 > - b.eq .Lloop > + b.eq L(loop) > > /* > * Input string is not 32-byte aligned. We calculate the syndrome > @@ -110,41 +83,41 @@ def_fn memchr > and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b > addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ > addp vend.16b, vend.16b, vend.16b /* 128->64 */ > - mov synd, vend.2d[0] > + mov synd, vend.d[0] > /* Clear the soff*2 lower bits */ > lsl tmp, soff, #1 > lsr synd, synd, tmp > lsl synd, synd, tmp > /* The first block can also be the last */ > - b.ls .Lmasklast > + b.ls L(masklast) > /* Have we found something already? 
*/ > - cbnz synd, .Ltail > + cbnz synd, L(tail) > > -.Lloop: > +L(loop): > ld1 {vdata1.16b, vdata2.16b}, [src], #32 > subs cntin, cntin, #32 > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > /* If we're out of data we finish regardless of the result */ > - b.ls .Lend > + b.ls L(end) > /* Use a fast check for the termination condition */ > orr vend.16b, vhas_chr1.16b, vhas_chr2.16b > addp vend.2d, vend.2d, vend.2d > - mov synd, vend.2d[0] > + mov synd, vend.d[0] > /* We're not out of data, loop if we haven't found the character */ > - cbz synd, .Lloop > + cbz synd, L(loop) > > -.Lend: > +L(end): > /* Termination condition found, let's calculate the syndrome value */ > and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b > and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b > addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ > addp vend.16b, vend.16b, vend.16b /* 128->64 */ > - mov synd, vend.2d[0] > + mov synd, vend.d[0] > /* Only do the clear for the last possible block */ > - b.hi .Ltail > + b.hs L(tail) > > -.Lmasklast: > +L(masklast): > /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ > add tmp, cntrem, soff > and tmp, tmp, #31 > @@ -153,7 +126,7 @@ def_fn memchr > lsl synd, synd, tmp > lsr synd, synd, tmp > > -.Ltail: > +L(tail): > /* Count the trailing zeros using bit reversing */ > rbit synd, synd > /* Compensate the last post-increment */ > @@ -168,9 +141,9 @@ def_fn memchr > csel result, xzr, result, eq > ret > > -.Lzero_length: > +L(zero_length): > mov result, #0 > ret > > - .size memchr, . - memchr > +END (memchr) > #endif > diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S > index 605d99365e..18874d3215 100644 > --- a/newlib/libc/machine/aarch64/memcmp.S > +++ b/newlib/libc/machine/aarch64/memcmp.S > @@ -1,57 +1,7 @@ > /* memcmp - compare memory > - > - Copyright (c) 2018 Linaro Limited > - All rights reserved. 
> - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > -/* > - * Copyright (c) 2017 ARM Ltd > - * All rights reserved. > - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions > - * are met: > - * 1. Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. > - * 2. 
Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in the > - * documentation and/or other materials provided with the distribution. > - * 3. The name of the company may not be used to endorse or promote > - * products derived from this software without specific prior written > - * permission. > * > - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED > - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF > - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. > - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED > - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING > - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + * Copyright (c) 2013-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > */ > > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > @@ -60,103 +10,79 @@ > > /* Assumptions: > * > - * ARMv8-a, AArch64, unaligned accesses. > + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. > */ > > -#define L(l) .L ## l > - > -/* Parameters and result. */ > -#define src1 x0 > -#define src2 x1 > -#define limit x2 > -#define result w0 > - > -/* Internal variables. 
*/ > -#define data1 x3 > -#define data1w w3 > -#define data1h x4 > -#define data2 x5 > -#define data2w w5 > -#define data2h x6 > -#define tmp1 x7 > -#define tmp2 x8 > - > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn memcmp p2align=6 > - subs limit, limit, 8 > - b.lo L(less8) > - > - ldr data1, [src1], 8 > - ldr data2, [src2], 8 > - cmp data1, data2 > - b.ne L(return) > - > - subs limit, limit, 8 > - b.gt L(more16) > - > - ldr data1, [src1, limit] > - ldr data2, [src2, limit] > - b L(return) > - > -L(more16): > - ldr data1, [src1], 8 > - ldr data2, [src2], 8 > - cmp data1, data2 > - bne L(return) > - > - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) > - strings. */ > - subs limit, limit, 16 > +#include "asmdefs.h" > + > +#define src1 x0 > +#define src2 x1 > +#define limit x2 > +#define result w0 > + > +#define data1 x3 > +#define data1w w3 > +#define data2 x4 > +#define data2w w4 > +#define data3 x5 > +#define data3w w5 > +#define data4 x6 > +#define data4w w6 > +#define tmp x6 > +#define src1end x7 > +#define src2end x8 > + > + > +ENTRY (memcmp) > + PTR_ARG (0) > + PTR_ARG (1) > + SIZE_ARG (2) > + > + cmp limit, 16 > + b.lo L(less16) > + ldp data1, data3, [src1] > + ldp data2, data4, [src2] > + ccmp data1, data2, 0, ne > + ccmp data3, data4, 0, eq > + b.ne L(return2) > + > + add src1end, src1, limit > + add src2end, src2, limit > + cmp limit, 32 > b.ls L(last_bytes) > + cmp limit, 160 > + b.hs L(loop_align) > + sub limit, limit, 32 > > - /* We overlap loads between 0-32 bytes at either side of SRC1 when we > - try to align, so limit it only to strings larger than 128 bytes. */ > - cmp limit, 96 > - b.ls L(loop16) > - > - /* Align src1 and adjust src2 with bytes not yet done. */ > - and tmp1, src1, 15 > - add limit, limit, tmp1 > - sub src1, src1, tmp1 > - sub src2, src2, tmp1 > - > - /* Loop performing 16 bytes per iteration using aligned src1. 
> - Limit is pre-decremented by 16 and must be larger than zero. > - Exit if <= 16 bytes left to do or if the data is not equal. */ > .p2align 4 > -L(loop16): > - ldp data1, data1h, [src1], 16 > - ldp data2, data2h, [src2], 16 > - subs limit, limit, 16 > - ccmp data1, data2, 0, hi > - ccmp data1h, data2h, 0, eq > - b.eq L(loop16) > - > +L(loop32): > + ldp data1, data3, [src1, 16] > + ldp data2, data4, [src2, 16] > cmp data1, data2 > - bne L(return) > - mov data1, data1h > - mov data2, data2h > + ccmp data3, data4, 0, eq > + b.ne L(return2) > + cmp limit, 16 > + b.ls L(last_bytes) > + > + ldp data1, data3, [src1, 32] > + ldp data2, data4, [src2, 32] > cmp data1, data2 > - bne L(return) > + ccmp data3, data4, 0, eq > + b.ne L(return2) > + add src1, src1, 32 > + add src2, src2, 32 > +L(last64): > + subs limit, limit, 32 > + b.hi L(loop32) > > /* Compare last 1-16 bytes using unaligned access. */ > L(last_bytes): > - add src1, src1, limit > - add src2, src2, limit > - ldp data1, data1h, [src1] > - ldp data2, data2h, [src2] > - cmp data1, data2 > - bne L(return) > - mov data1, data1h > - mov data2, data2h > + ldp data1, data3, [src1end, -16] > + ldp data2, data4, [src2end, -16] > +L(return2): > cmp data1, data2 > + csel data1, data1, data3, ne > + csel data2, data2, data4, ne > > /* Compare data bytes and set return value to 0, -1 or 1. */ > L(return): > @@ -164,33 +90,106 @@ L(return): > rev data1, data1 > rev data2, data2 > #endif > - cmp data1, data2 > -L(ret_eq): > + cmp data1, data2 > cset result, ne > cneg result, result, lo > ret > > .p2align 4 > - /* Compare up to 8 bytes. Limit is [-8..-1]. 
*/ > +L(less16): > + add src1end, src1, limit > + add src2end, src2, limit > + tbz limit, 3, L(less8) > + ldr data1, [src1] > + ldr data2, [src2] > + ldr data3, [src1end, -8] > + ldr data4, [src2end, -8] > + b L(return2) > + > + .p2align 4 > L(less8): > - adds limit, limit, 4 > - b.lo L(less4) > - ldr data1w, [src1], 4 > - ldr data2w, [src2], 4 > + tbz limit, 2, L(less4) > + ldr data1w, [src1] > + ldr data2w, [src2] > + ldr data3w, [src1end, -4] > + ldr data4w, [src2end, -4] > + b L(return2) > + > +L(less4): > + tbz limit, 1, L(less2) > + ldrh data1w, [src1] > + ldrh data2w, [src2] > cmp data1w, data2w > b.ne L(return) > - sub limit, limit, 4 > -L(less4): > - adds limit, limit, 4 > - beq L(ret_eq) > -L(byte_loop): > - ldrb data1w, [src1], 1 > - ldrb data2w, [src2], 1 > - subs limit, limit, 1 > - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ > - b.eq L(byte_loop) > +L(less2): > + mov result, 0 > + tbz limit, 0, L(return_zero) > + ldrb data1w, [src1end, -1] > + ldrb data2w, [src2end, -1] > sub result, data1w, data2w > +L(return_zero): > + ret > + > +L(loop_align): > + ldp data1, data3, [src1, 16] > + ldp data2, data4, [src2, 16] > + cmp data1, data2 > + ccmp data3, data4, 0, eq > + b.ne L(return2) > + > + /* Align src2 and adjust src1, src2 and limit. */ > + and tmp, src2, 15 > + sub tmp, tmp, 16 > + sub src2, src2, tmp > + add limit, limit, tmp > + sub src1, src1, tmp > + sub limit, limit, 64 + 16 > + > + .p2align 4 > +L(loop64): > + ldr q0, [src1, 16] > + ldr q1, [src2, 16] > + subs limit, limit, 64 > + ldr q2, [src1, 32] > + ldr q3, [src2, 32] > + eor v0.16b, v0.16b, v1.16b > + eor v1.16b, v2.16b, v3.16b > + ldr q2, [src1, 48] > + ldr q3, [src2, 48] > + umaxp v0.16b, v0.16b, v1.16b > + ldr q4, [src1, 64]! > + ldr q5, [src2, 64]! 
> + eor v1.16b, v2.16b, v3.16b > + eor v2.16b, v4.16b, v5.16b > + umaxp v1.16b, v1.16b, v2.16b > + umaxp v0.16b, v0.16b, v1.16b > + umaxp v0.16b, v0.16b, v0.16b > + fmov tmp, d0 > + ccmp tmp, 0, 0, hi > + b.eq L(loop64) > + > + /* If equal, process last 1-64 bytes using scalar loop. */ > + add limit, limit, 64 + 16 > + cbz tmp, L(last64) > + > + /* Determine the 8-byte aligned offset of the first difference. */ > +#ifdef __AARCH64EB__ > + rev16 tmp, tmp > +#endif > + rev tmp, tmp > + clz tmp, tmp > + bic tmp, tmp, 7 > + sub tmp, tmp, 48 > + ldr data1, [src1, tmp] > + ldr data2, [src2, tmp] > +#ifndef __AARCH64EB__ > + rev data1, data1 > + rev data2, data2 > +#endif > + mov result, 1 > + cmp data1, data2 > + cneg result, result, lo > ret > > - .size memcmp, . - memcmp > +END (memcmp) > #endif > diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S > index 463bad0a18..248e7843a2 100644 > --- a/newlib/libc/machine/aarch64/memcpy.S > +++ b/newlib/libc/machine/aarch64/memcpy.S > @@ -1,55 +1,8 @@ > -/* Copyright (c) 2012-2013, Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. 
> - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > /* > - * Copyright (c) 2015 ARM Ltd > - * All rights reserved. > - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions > - * are met: > - * 1. Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. > - * 2. Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in the > - * documentation and/or other materials provided with the distribution. > - * 3. The name of the company may not be used to endorse or promote > - * products derived from this software without specific prior written > - * permission. > + * memcpy - copy memory area > * > - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED > - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF > - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
> - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED > - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING > - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + * Copyright (c) 2012-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > */ > > /* Assumptions: > @@ -61,6 +14,7 @@ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See memcpy-stub.c */ > #else > +#include "asmdefs.h" > > #define dstin x0 > #define src x1 > @@ -71,122 +25,139 @@ > #define A_l x6 > #define A_lw w6 > #define A_h x7 > -#define A_hw w7 > #define B_l x8 > #define B_lw w8 > #define B_h x9 > #define C_l x10 > +#define C_lw w10 > #define C_h x11 > #define D_l x12 > #define D_h x13 > -#define E_l src > -#define E_h count > -#define F_l srcend > -#define F_h dst > -#define tmp1 x9 > - > -#define L(l) .L ## l > - > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -/* Copies are split into 3 main cases: small copies of up to 16 bytes, > - medium copies of 17..96 bytes which are fully unrolled. Large copies > - of more than 96 bytes align the destination and use an unrolled loop > - processing 64 bytes per iteration. > - Small and medium copies read all data before writing, allowing any > - kind of overlap, and memmove tailcalls memcpy for these cases as > - well as non-overlapping copies. 
> +#define E_l x14 > +#define E_h x15 > +#define F_l x16 > +#define F_h x17 > +#define G_l count > +#define G_h dst > +#define H_l src > +#define H_h srcend > +#define tmp1 x14 > + > +/* This implementation handles overlaps and supports both memcpy and memmove > + from a single entry point. It uses unaligned accesses and branchless > + sequences to keep the code small, simple and improve performance. > + > + Copies are split into 3 main cases: small copies of up to 32 bytes, medium > + copies of up to 128 bytes, and large copies. The overhead of the overlap > + check is negligible since it is only required for large copies. > + > + Large copies use a software pipelined loop processing 64 bytes per iteration. > + The destination pointer is 16-byte aligned to minimize unaligned accesses. > + The loop tail is handled by always copying 64 bytes from the end. > */ > > -def_fn memcpy p2align=6 > - prfm PLDL1KEEP, [src] > +ENTRY_ALIAS (memmove) > +ENTRY (memcpy) > + PTR_ARG (0) > + PTR_ARG (1) > + SIZE_ARG (2) > add srcend, src, count > add dstend, dstin, count > - cmp count, 16 > - b.ls L(copy16) > - cmp count, 96 > + cmp count, 128 > b.hi L(copy_long) > + cmp count, 32 > + b.hi L(copy32_128) > > - /* Medium copies: 17..96 bytes. */ > - sub tmp1, count, 1 > + /* Small copies: 0..32 bytes. */ > + cmp count, 16 > + b.lo L(copy16) > ldp A_l, A_h, [src] > - tbnz tmp1, 6, L(copy96) > ldp D_l, D_h, [srcend, -16] > - tbz tmp1, 5, 1f > - ldp B_l, B_h, [src, 16] > - ldp C_l, C_h, [srcend, -32] > - stp B_l, B_h, [dstin, 16] > - stp C_l, C_h, [dstend, -32] > -1: > stp A_l, A_h, [dstin] > stp D_l, D_h, [dstend, -16] > ret > > - .p2align 4 > - /* Small copies: 0..16 bytes. */ > + /* Copy 8-15 bytes. */ > L(copy16): > - cmp count, 8 > - b.lo 1f > + tbz count, 3, L(copy8) > ldr A_l, [src] > ldr A_h, [srcend, -8] > str A_l, [dstin] > str A_h, [dstend, -8] > ret > - .p2align 4 > -1: > - tbz count, 2, 1f > + > + .p2align 3 > + /* Copy 4-7 bytes. 
*/ > +L(copy8): > + tbz count, 2, L(copy4) > ldr A_lw, [src] > - ldr A_hw, [srcend, -4] > + ldr B_lw, [srcend, -4] > str A_lw, [dstin] > - str A_hw, [dstend, -4] > + str B_lw, [dstend, -4] > ret > > - /* Copy 0..3 bytes. Use a branchless sequence that copies the same > - byte 3 times if count==1, or the 2nd byte twice if count==2. */ > -1: > - cbz count, 2f > + /* Copy 0..3 bytes using a branchless sequence. */ > +L(copy4): > + cbz count, L(copy0) > lsr tmp1, count, 1 > ldrb A_lw, [src] > - ldrb A_hw, [srcend, -1] > + ldrb C_lw, [srcend, -1] > ldrb B_lw, [src, tmp1] > strb A_lw, [dstin] > strb B_lw, [dstin, tmp1] > - strb A_hw, [dstend, -1] > -2: ret > + strb C_lw, [dstend, -1] > +L(copy0): > + ret > > .p2align 4 > - /* Copy 64..96 bytes. Copy 64 bytes from the start and > - 32 bytes from the end. */ > -L(copy96): > + /* Medium copies: 33..128 bytes. */ > +L(copy32_128): > + ldp A_l, A_h, [src] > ldp B_l, B_h, [src, 16] > - ldp C_l, C_h, [src, 32] > - ldp D_l, D_h, [src, 48] > - ldp E_l, E_h, [srcend, -32] > - ldp F_l, F_h, [srcend, -16] > + ldp C_l, C_h, [srcend, -32] > + ldp D_l, D_h, [srcend, -16] > + cmp count, 64 > + b.hi L(copy128) > stp A_l, A_h, [dstin] > stp B_l, B_h, [dstin, 16] > - stp C_l, C_h, [dstin, 32] > - stp D_l, D_h, [dstin, 48] > - stp E_l, E_h, [dstend, -32] > - stp F_l, F_h, [dstend, -16] > + stp C_l, C_h, [dstend, -32] > + stp D_l, D_h, [dstend, -16] > ret > > - /* Align DST to 16 byte alignment so that we don't cross cache line > - boundaries on both loads and stores. There are at least 96 bytes > - to copy, so copy 16 bytes unaligned and then align. The loop > - copies 64 bytes per iteration and prefetches one iteration ahead. */ > + .p2align 4 > + /* Copy 65..128 bytes. 
*/ > +L(copy128): > + ldp E_l, E_h, [src, 32] > + ldp F_l, F_h, [src, 48] > + cmp count, 96 > + b.ls L(copy96) > + ldp G_l, G_h, [srcend, -64] > + ldp H_l, H_h, [srcend, -48] > + stp G_l, G_h, [dstend, -64] > + stp H_l, H_h, [dstend, -48] > +L(copy96): > + stp A_l, A_h, [dstin] > + stp B_l, B_h, [dstin, 16] > + stp E_l, E_h, [dstin, 32] > + stp F_l, F_h, [dstin, 48] > + stp C_l, C_h, [dstend, -32] > + stp D_l, D_h, [dstend, -16] > + ret > > .p2align 4 > + /* Copy more than 128 bytes. */ > L(copy_long): > + /* Use backwards copy if there is an overlap. */ > + sub tmp1, dstin, src > + cbz tmp1, L(copy0) > + cmp tmp1, count > + b.lo L(copy_long_backwards) > + > + /* Copy 16 bytes and then align dst to 16-byte alignment. */ > + > + ldp D_l, D_h, [src] > and tmp1, dstin, 15 > bic dst, dstin, 15 > - ldp D_l, D_h, [src] > sub src, src, tmp1 > add count, count, tmp1 /* Count is now 16 too large. */ > ldp A_l, A_h, [src, 16] > @@ -195,8 +166,9 @@ L(copy_long): > ldp C_l, C_h, [src, 48] > ldp D_l, D_h, [src, 64]! > subs count, count, 128 + 16 /* Test and readjust count. */ > - b.ls 2f > -1: > + b.ls L(copy64_from_end) > + > +L(loop64): > stp A_l, A_h, [dst, 16] > ldp A_l, A_h, [src, 16] > stp B_l, B_h, [dst, 32] > @@ -206,12 +178,10 @@ L(copy_long): > stp D_l, D_h, [dst, 64]! > ldp D_l, D_h, [src, 64]! > subs count, count, 64 > - b.hi 1b > + b.hi L(loop64) > > - /* Write the last full set of 64 bytes. The remainder is at most 64 > - bytes, so it is safe to always copy 64 bytes from the end even if > - there is just 1 byte left. */ > -2: > + /* Write the last iteration and copy 64 bytes from the end. */ > +L(copy64_from_end): > ldp E_l, E_h, [srcend, -64] > stp A_l, A_h, [dst, 16] > ldp A_l, A_h, [srcend, -48] > @@ -226,5 +196,51 @@ L(copy_long): > stp C_l, C_h, [dstend, -16] > ret > > - .size memcpy, . - memcpy > + .p2align 4 > + > + /* Large backwards copy for overlapping copies. > + Copy 16 bytes and then align dst to 16-byte alignment. 
*/ > +L(copy_long_backwards): > + ldp D_l, D_h, [srcend, -16] > + and tmp1, dstend, 15 > + sub srcend, srcend, tmp1 > + sub count, count, tmp1 > + ldp A_l, A_h, [srcend, -16] > + stp D_l, D_h, [dstend, -16] > + ldp B_l, B_h, [srcend, -32] > + ldp C_l, C_h, [srcend, -48] > + ldp D_l, D_h, [srcend, -64]! > + sub dstend, dstend, tmp1 > + subs count, count, 128 > + b.ls L(copy64_from_start) > + > +L(loop64_backwards): > + stp A_l, A_h, [dstend, -16] > + ldp A_l, A_h, [srcend, -16] > + stp B_l, B_h, [dstend, -32] > + ldp B_l, B_h, [srcend, -32] > + stp C_l, C_h, [dstend, -48] > + ldp C_l, C_h, [srcend, -48] > + stp D_l, D_h, [dstend, -64]! > + ldp D_l, D_h, [srcend, -64]! > + subs count, count, 64 > + b.hi L(loop64_backwards) > + > + /* Write the last iteration and copy 64 bytes from the start. */ > +L(copy64_from_start): > + ldp G_l, G_h, [src, 48] > + stp A_l, A_h, [dstend, -16] > + ldp A_l, A_h, [src, 32] > + stp B_l, B_h, [dstend, -32] > + ldp B_l, B_h, [src, 16] > + stp C_l, C_h, [dstend, -48] > + ldp C_l, C_h, [src] > + stp D_l, D_h, [dstend, -64] > + stp G_l, G_h, [dstin, 48] > + stp A_l, A_h, [dstin, 32] > + stp B_l, B_h, [dstin, 16] > + stp C_l, C_h, [dstin] > + ret > + > +END (memcpy) > #endif > diff --git a/newlib/libc/machine/aarch64/memset.S b/newlib/libc/machine/aarch64/memset.S > index 103e3f8bb0..ca76439a91 100644 > --- a/newlib/libc/machine/aarch64/memset.S > +++ b/newlib/libc/machine/aarch64/memset.S > @@ -1,66 +1,20 @@ > -/* Copyright (c) 2012-2013, Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. 
> - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > /* > - * Copyright (c) 2015 ARM Ltd > - * All rights reserved. > + * memset - fill memory with a constant byte > * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions > - * are met: > - * 1. Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. > - * 2. Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in the > - * documentation and/or other materials provided with the distribution. > - * 3. 
The name of the company may not be used to endorse or promote > - * products derived from this software without specific prior written > - * permission. > - * > - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED > - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF > - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. > - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED > - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING > - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + * Copyright (c) 2012-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > */ > > /* Assumptions: > * > - * ARMv8-a, AArch64, unaligned accesses > + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. 
> * > */ > > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See memset-stub.c */ > #else > +#include "asmdefs.h" > > #define dstin x0 > #define val x1 > @@ -68,24 +22,11 @@ > #define count x2 > #define dst x3 > #define dstend x4 > -#define tmp1 x5 > -#define tmp1w w5 > -#define tmp2 x6 > -#define tmp2w w6 > -#define zva_len x7 > -#define zva_lenw w7 > - > -#define L(l) .L ## l > +#define zva_val x5 > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn memset p2align=6 > +ENTRY (memset) > + PTR_ARG (0) > + SIZE_ARG (2) > > dup v0.16B, valw > add dstend, dstin, count > @@ -101,7 +42,7 @@ def_fn memset p2align=6 > str val, [dstin] > str val, [dstend, -8] > ret > - nop > + .p2align 4 > 1: tbz count, 2, 2f > str valw, [dstin] > str valw, [dstend, -4] > @@ -131,110 +72,49 @@ L(set96): > stp q0, q0, [dstend, -32] > ret > > - .p2align 3 > - nop > + .p2align 4 > L(set_long): > and valw, valw, 255 > bic dst, dstin, 15 > str q0, [dstin] > - cmp count, 256 > - ccmp valw, 0, 0, cs > - b.eq L(try_zva) > -L(no_zva): > - sub count, dstend, dst /* Count is 16 too large. */ > - sub dst, dst, 16 /* Dst is biased by -32. */ > - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ > -1: stp q0, q0, [dst, 32] > - stp q0, q0, [dst, 64]! > -L(tail64): > - subs count, count, 64 > - b.hi 1b > -2: stp q0, q0, [dstend, -64] > - stp q0, q0, [dstend, -32] > - ret > - > - .p2align 3 > -L(try_zva): > - mrs tmp1, dczid_el0 > - tbnz tmp1w, 4, L(no_zva) > - and tmp1w, tmp1w, 15 > - cmp tmp1w, 4 /* ZVA size is 64 bytes. */ > - b.ne L(zva_128) > - > - /* Write the first and last 64 byte aligned block using stp rather > - than using DC ZVA. This is faster on some cores. > - */ > -L(zva_64): > + cmp count, 160 > + ccmp valw, 0, 0, hs > + b.ne L(no_zva) > + > +#ifndef SKIP_ZVA_CHECK > + mrs zva_val, dczid_el0 > + and zva_val, zva_val, 31 > + cmp zva_val, 4 /* ZVA size is 64 bytes. 
*/ > + b.ne L(no_zva) > +#endif > str q0, [dst, 16] > stp q0, q0, [dst, 32] > bic dst, dst, 63 > - stp q0, q0, [dst, 64] > - stp q0, q0, [dst, 96] > - sub count, dstend, dst /* Count is now 128 too large. */ > - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ > - add dst, dst, 128 > - nop > -1: dc zva, dst > + sub count, dstend, dst /* Count is now 64 too large. */ > + sub count, count, 128 /* Adjust count and bias for loop. */ > + > + .p2align 4 > +L(zva_loop): > add dst, dst, 64 > + dc zva, dst > subs count, count, 64 > - b.hi 1b > - stp q0, q0, [dst, 0] > - stp q0, q0, [dst, 32] > + b.hi L(zva_loop) > stp q0, q0, [dstend, -64] > stp q0, q0, [dstend, -32] > ret > > - .p2align 3 > -L(zva_128): > - cmp tmp1w, 5 /* ZVA size is 128 bytes. */ > - b.ne L(zva_other) > - > - str q0, [dst, 16] > +L(no_zva): > + sub count, dstend, dst /* Count is 16 too large. */ > + sub dst, dst, 16 /* Dst is biased by -32. */ > + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ > +L(no_zva_loop): > stp q0, q0, [dst, 32] > - stp q0, q0, [dst, 64] > - stp q0, q0, [dst, 96] > - bic dst, dst, 127 > - sub count, dstend, dst /* Count is now 128 too large. */ > - sub count, count, 128+128 /* Adjust count and bias for loop. */ > - add dst, dst, 128 > -1: dc zva, dst > - add dst, dst, 128 > - subs count, count, 128 > - b.hi 1b > - stp q0, q0, [dstend, -128] > - stp q0, q0, [dstend, -96] > + stp q0, q0, [dst, 64]! > + subs count, count, 64 > + b.hi L(no_zva_loop) > stp q0, q0, [dstend, -64] > stp q0, q0, [dstend, -32] > ret > > -L(zva_other): > - mov tmp2w, 4 > - lsl zva_lenw, tmp2w, tmp1w > - add tmp1, zva_len, 64 /* Max alignment bytes written. */ > - cmp count, tmp1 > - blo L(no_zva) > - > - sub tmp2, zva_len, 1 > - add tmp1, dst, zva_len > - add dst, dst, 16 > - subs count, tmp1, dst /* Actual alignment bytes to write. */ > - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. 
*/ > - beq 2f > -1: stp q0, q0, [dst], 64 > - stp q0, q0, [dst, -32] > - subs count, count, 64 > - b.hi 1b > -2: mov dst, tmp1 > - sub count, dstend, tmp1 /* Remaining bytes to write. */ > - subs count, count, zva_len > - b.lo 4f > -3: dc zva, dst > - add dst, dst, zva_len > - subs count, count, zva_len > - b.hs 3b > -4: add count, count, zva_len > - sub dst, dst, 32 /* Bias dst for tail loop. */ > - b L(tail64) > - > - .size memset, . - memset > +END (memset) > #endif > diff --git a/newlib/libc/machine/aarch64/stpcpy.S b/newlib/libc/machine/aarch64/stpcpy.S > index 696b45889f..155c68d75a 100644 > --- a/newlib/libc/machine/aarch64/stpcpy.S > +++ b/newlib/libc/machine/aarch64/stpcpy.S > @@ -1,34 +1,10 @@ > /* > - stpcpy - copy a string returning pointer to end. > + * stpcpy - copy a string returning pointer to end. > + * > + * Copyright (c) 2020, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > > - Copyright (c) 2015 ARM Ltd. > - All Rights Reserved. > +#define BUILD_STPCPY 1 > > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the company nor the names of its contributors > - may be used to endorse or promote products derived from this > - software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > -/* This is just a wrapper that uses strcpy code with appropriate > - pre-defines. */ > - > -#define BUILD_STPCPY > #include "strcpy.S" > diff --git a/newlib/libc/machine/aarch64/strchr.S b/newlib/libc/machine/aarch64/strchr.S > index 2448dbc7d5..500d9aff29 100644 > --- a/newlib/libc/machine/aarch64/strchr.S > +++ b/newlib/libc/machine/aarch64/strchr.S > @@ -1,32 +1,9 @@ > /* > - strchr - find a character in a string > - > - Copyright (c) 2014, ARM Limited > - All rights Reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the company nor the names of its contributors > - may be used to endorse or promote products derived from this > - software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > + * strchr - find a character in a string > + * > + * Copyright (c) 2014-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strchr-stub.c */ > #else > @@ -37,6 +14,8 @@ > * Neon Available. > */ > > +#include "asmdefs.h" > + > /* Arguments and results. */ > #define srcin x0 > #define chrin w1 > @@ -74,26 +53,19 @@ > > /* Locals and temporaries. */ > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn strchr > - /* Magic constant 0x40100401 to allow us to identify which lane > - matches the requested byte. Magic constant 0x80200802 used > - similarly for NUL termination. */ > - mov wtmp2, #0x0401 > - movk wtmp2, #0x4010, lsl #16 > +ENTRY (strchr) > + PTR_ARG (0) > + /* Magic constant 0xc0300c03 to allow us to identify which lane > + matches the requested byte. Even bits are set if the character > + matches, odd bits if either the char is NUL or matches. */ > + mov wtmp2, 0x0c03 > + movk wtmp2, 0xc030, lsl 16 > dup vrepchr.16b, chrin > bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ > dup vrepmask_c.4s, wtmp2 > ands tmp1, srcin, #31 > add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ > - b.eq .Lloop > + b.eq L(loop) > > /* Input string is not 32-byte aligned. 
Rather than forcing > the padding bytes to a safe value, we calculate the syndrome > @@ -105,49 +77,42 @@ def_fn strchr > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > cmeq vhas_nul2.16b, vdata2.16b, #0 > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b > - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b > - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b > - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b > - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b > - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b > + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b > + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b > + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b > + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b > lsl tmp1, tmp1, #1 > addp vend1.16b, vend1.16b, vend2.16b // 256->128 > mov tmp3, #~0 > addp vend1.16b, vend1.16b, vend2.16b // 128->64 > lsr tmp1, tmp3, tmp1 > > - mov tmp3, vend1.2d[0] > + mov tmp3, vend1.d[0] > bic tmp1, tmp3, tmp1 // Mask padding bits. > - cbnz tmp1, .Ltail > + cbnz tmp1, L(tail) > > -.Lloop: > + .p2align 4 > +L(loop): > ld1 {vdata1.16b, vdata2.16b}, [src], #32 > - cmeq vhas_nul1.16b, vdata1.16b, #0 > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > - cmeq vhas_nul2.16b, vdata2.16b, #0 > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > - /* Use a fast check for the termination condition. */ > - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b > - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b > - orr vend1.16b, vend1.16b, vend2.16b > - addp vend1.2d, vend1.2d, vend1.2d > - mov tmp1, vend1.2d[0] > - cbz tmp1, .Lloop > + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b > + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b > + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b > + umaxp vend1.16b, vend1.16b, vend1.16b > + mov tmp1, vend1.d[0] > + cbz tmp1, L(loop) > > /* Termination condition found. Now need to establish exactly why > we terminated. 
*/ > - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b > - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b > - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b > - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b > - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b > - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b > + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b > + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b > + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b > + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b > addp vend1.16b, vend1.16b, vend2.16b // 256->128 > addp vend1.16b, vend1.16b, vend2.16b // 128->64 > - > - mov tmp1, vend1.2d[0] > -.Ltail: > + mov tmp1, vend1.d[0] > +L(tail): > /* Count the trailing zeros, by bit reversing... */ > rbit tmp1, tmp1 > /* Re-bias source. */ > @@ -160,5 +125,5 @@ def_fn strchr > csel result, result, xzr, eq > ret > > - .size strchr, . - strchr > +END (strchr) > #endif > diff --git a/newlib/libc/machine/aarch64/strchrnul.S b/newlib/libc/machine/aarch64/strchrnul.S > index a0ac13b7f4..ceaf4dca17 100644 > --- a/newlib/libc/machine/aarch64/strchrnul.S > +++ b/newlib/libc/machine/aarch64/strchrnul.S > @@ -1,32 +1,9 @@ > /* > - strchrnul - find a character or nul in a string > - > - Copyright (c) 2014, ARM Limited > - All rights Reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the company nor the names of its contributors > - may be used to endorse or promote products derived from this > - software without specific prior written permission. 
> - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > + * strchrnul - find a character or nul in a string > + * > + * Copyright (c) 2014-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strchrnul-stub.c */ > #else > @@ -37,6 +14,8 @@ > * Neon Available. > */ > > +#include "asmdefs.h" > + > /* Arguments and results. */ > #define srcin x0 > #define chrin w1 > @@ -70,15 +49,8 @@ > > /* Locals and temporaries. */ > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn strchrnul > +ENTRY (strchrnul) > + PTR_ARG (0) > /* Magic constant 0x40100401 to allow us to identify which lane > matches the termination condition. */ > mov wtmp2, #0x0401 > @@ -87,7 +59,7 @@ def_fn strchrnul > bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ > dup vrepmask.4s, wtmp2 > ands tmp1, srcin, #31 > - b.eq .Lloop > + b.eq L(loop) > > /* Input string is not 32-byte aligned. Rather than forcing > the padding bytes to a safe value, we calculate the syndrome > @@ -95,47 +67,43 @@ def_fn strchrnul > syndrome that are related to the padding. 
*/ > ld1 {vdata1.16b, vdata2.16b}, [src], #32 > neg tmp1, tmp1 > - cmeq vhas_nul1.16b, vdata1.16b, #0 > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > - cmeq vhas_nul2.16b, vdata2.16b, #0 > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > - orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b > - orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b > - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b > - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b > + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b > + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b > + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b > + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b > lsl tmp1, tmp1, #1 > addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 > mov tmp3, #~0 > addp vend1.16b, vend1.16b, vend1.16b // 128->64 > lsr tmp1, tmp3, tmp1 > > - mov tmp3, vend1.2d[0] > + mov tmp3, vend1.d[0] > bic tmp1, tmp3, tmp1 // Mask padding bits. > - cbnz tmp1, .Ltail > + cbnz tmp1, L(tail) > > -.Lloop: > + .p2align 4 > +L(loop): > ld1 {vdata1.16b, vdata2.16b}, [src], #32 > - cmeq vhas_nul1.16b, vdata1.16b, #0 > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > - cmeq vhas_nul2.16b, vdata2.16b, #0 > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > - /* Use a fast check for the termination condition. */ > - orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b > - orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b > - orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b > - addp vend1.2d, vend1.2d, vend1.2d > - mov tmp1, vend1.2d[0] > - cbz tmp1, .Lloop > + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b > + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b > + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b > + umaxp vend1.16b, vend1.16b, vend1.16b > + mov tmp1, vend1.d[0] > + cbz tmp1, L(loop) > > /* Termination condition found. Now need to establish exactly why > we terminated. 
*/ > - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b > - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b > + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b > + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b > addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 > addp vend1.16b, vend1.16b, vend1.16b // 128->64 > > - mov tmp1, vend1.2d[0] > -.Ltail: > + mov tmp1, vend1.d[0] > +L(tail): > /* Count the trailing zeros, by bit reversing... */ > rbit tmp1, tmp1 > /* Re-bias source. */ > @@ -145,5 +113,5 @@ def_fn strchrnul > add result, src, tmp1, lsr #1 > ret > > - .size strchrnul, . - strchrnul > +END (strchrnul) > #endif > diff --git a/newlib/libc/machine/aarch64/strcmp.S b/newlib/libc/machine/aarch64/strcmp.S > index e2bef2d49d..691a1760ee 100644 > --- a/newlib/libc/machine/aarch64/strcmp.S > +++ b/newlib/libc/machine/aarch64/strcmp.S > @@ -1,202 +1,192 @@ > -/* Copyright (c) 2012-2018, Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > -/* Assumptions: > +/* > + * strcmp - compare two strings > * > - * ARMv8-a, AArch64 > + * Copyright (c) 2012-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > */ > > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strcmp-stub.c */ > #else > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > +/* Assumptions: > + * > + * ARMv8-a, AArch64. > + * MTE compatible. > + */ > > -#define L(label) .L ## label > +#include "asmdefs.h" > > #define REP8_01 0x0101010101010101 > #define REP8_7f 0x7f7f7f7f7f7f7f7f > -#define REP8_80 0x8080808080808080 > > -/* Parameters and result. */ > #define src1 x0 > #define src2 x1 > #define result x0 > > -/* Internal variables. */ > #define data1 x2 > #define data1w w2 > #define data2 x3 > #define data2w w3 > #define has_nul x4 > #define diff x5 > +#define off1 x5 > #define syndrome x6 > -#define tmp1 x7 > -#define tmp2 x8 > -#define tmp3 x9 > -#define zeroones x10 > -#define pos x11 > - > - /* Start of performance-critical section -- one 64B cache line. */ > -def_fn strcmp p2align=6 > - eor tmp1, src1, src2 > - mov zeroones, #REP8_01 > - tst tmp1, #7 > +#define tmp x6 > +#define data3 x7 > +#define zeroones x8 > +#define shift x9 > +#define off2 x10 > + > +/* On big-endian early bytes are at MSB and on little-endian LSB. 
> + LS_FW means shifting towards early bytes. */ > +#ifdef __AARCH64EB__ > +# define LS_FW lsl > +#else > +# define LS_FW lsr > +#endif > + > +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > + can be done in parallel across the entire word. > + Since carry propagation makes 0x1 bytes before a NUL byte appear > + NUL too in big-endian, byte-reverse the data before the NUL check. */ > + > + > +ENTRY (strcmp) > + PTR_ARG (0) > + PTR_ARG (1) > + sub off2, src2, src1 > + mov zeroones, REP8_01 > + and tmp, src1, 7 > + tst off2, 7 > b.ne L(misaligned8) > - ands tmp1, src1, #7 > - b.ne L(mutual_align) > - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > - can be done in parallel across the entire word. */ > + cbnz tmp, L(mutual_align) > + > + .p2align 4 > + > L(loop_aligned): > - ldr data1, [src1], #8 > - ldr data2, [src2], #8 > + ldr data2, [src1, off2] > + ldr data1, [src1], 8 > L(start_realigned): > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - eor diff, data1, data2 /* Non-zero if differences found. */ > - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ > +#ifdef __AARCH64EB__ > + rev tmp, data1 > + sub has_nul, tmp, zeroones > + orr tmp, tmp, REP8_7f > +#else > + sub has_nul, data1, zeroones > + orr tmp, data1, REP8_7f > +#endif > + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ > + ccmp data1, data2, 0, eq > + b.eq L(loop_aligned) > +#ifdef __AARCH64EB__ > + rev has_nul, has_nul > +#endif > + eor diff, data1, data2 > orr syndrome, diff, has_nul > - cbz syndrome, L(loop_aligned) > - /* End of performance-critical section -- one 64B cache line. 
*/ > - > L(end): > -#ifndef __AARCH64EB__ > +#ifndef __AARCH64EB__ > rev syndrome, syndrome > rev data1, data1 > - /* The MS-non-zero bit of the syndrome marks either the first bit > - that is different, or the top bit of the first zero byte. > - Shifting left now will bring the critical information into the > - top bits. */ > - clz pos, syndrome > rev data2, data2 > - lsl data1, data1, pos > - lsl data2, data2, pos > - /* But we need to zero-extend (char is unsigned) the value and then > - perform a signed 32-bit subtraction. */ > - lsr data1, data1, #56 > - sub result, data1, data2, lsr #56 > - ret > -#else > - /* For big-endian we cannot use the trick with the syndrome value > - as carry-propagation can corrupt the upper bits if the trailing > - bytes in the string contain 0x01. */ > - /* However, if there is no NUL byte in the dword, we can generate > - the result directly. We can't just subtract the bytes as the > - MSB might be significant. */ > - cbnz has_nul, 1f > - cmp data1, data2 > - cset result, ne > - cneg result, result, lo > - ret > -1: > - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ > - rev tmp3, data1 > - sub tmp1, tmp3, zeroones > - orr tmp2, tmp3, #REP8_7f > - bic has_nul, tmp1, tmp2 > - rev has_nul, has_nul > - orr syndrome, diff, has_nul > - clz pos, syndrome > - /* The MS-non-zero bit of the syndrome marks either the first bit > - that is different, or the top bit of the first zero byte. > +#endif > + clz shift, syndrome > + /* The most-significant-non-zero bit of the syndrome marks either the > + first bit that is different, or the top bit of the first zero byte. > Shifting left now will bring the critical information into the > top bits. */ > - lsl data1, data1, pos > - lsl data2, data2, pos > + lsl data1, data1, shift > + lsl data2, data2, shift > /* But we need to zero-extend (char is unsigned) the value and then > perform a signed 32-bit subtraction. 
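[As an aside for readers following the unified end path above: the syndrome trick — OR the XOR difference of the two words with the NUL mask, then locate the lowest marked byte — can be sketched in scalar C. This is a little-endian-only illustration; `cmp_words` and `load8` are illustrative names, not part of the patch, and the assembly instead byte-reverses and uses clz, which is equivalent.]

```c
#include <stdint.h>
#include <string.h>

/* Illustrative helper: load 8 bytes as a little-endian word.
   Assumes at least 8 readable bytes at s. */
static uint64_t load8(const char *s)
{
    uint64_t v;
    memcpy(&v, s, 8);
    return v;
}

/* Compare one 8-byte chunk of two strings, syndrome-style. */
static int cmp_words(uint64_t data1, uint64_t data2)
{
    uint64_t ones = 0x0101010101010101ULL;
    /* Non-zero iff data1 contains a NUL byte; the lowest set bit
       marks the first NUL (little-endian). */
    uint64_t has_nul = (data1 - ones) & ~data1 & 0x8080808080808080ULL;
    /* Syndrome: first differing byte OR first NUL, whichever is first. */
    uint64_t syndrome = (data1 ^ data2) | has_nul;
    if (syndrome == 0)
        return 0;              /* equal, no NUL: a real loop would continue */
    int byte = __builtin_ctzll(syndrome) >> 3;
    unsigned c1 = (data1 >> (8 * byte)) & 0xff;
    unsigned c2 = (data2 >> (8 * byte)) & 0xff;
    /* char is treated as unsigned, as in the assembly. */
    return (int)c1 - (int)c2;
}
```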
*/ > - lsr data1, data1, #56 > - sub result, data1, data2, lsr #56 > + lsr data1, data1, 56 > + sub result, data1, data2, lsr 56 > ret > -#endif > + > + .p2align 4 > > L(mutual_align): > /* Sources are mutually aligned, but are not currently at an > alignment boundary. Round down the addresses and then mask off > - the bytes that preceed the start point. */ > - bic src1, src1, #7 > - bic src2, src2, #7 > - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ > - ldr data1, [src1], #8 > - neg tmp1, tmp1 /* Bits to alignment -64. */ > - ldr data2, [src2], #8 > - mov tmp2, #~0 > -#ifdef __AARCH64EB__ > - /* Big-endian. Early bytes are at MSB. */ > - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ > -#else > - /* Little-endian. Early bytes are at LSB. */ > - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ > -#endif > - orr data1, data1, tmp2 > - orr data2, data2, tmp2 > + the bytes that precede the start point. */ > + bic src1, src1, 7 > + ldr data2, [src1, off2] > + ldr data1, [src1], 8 > + neg shift, src2, lsl 3 /* Bits to alignment -64. */ > + mov tmp, -1 > + LS_FW tmp, tmp, shift > + orr data1, data1, tmp > + orr data2, data2, tmp > b L(start_realigned) > > L(misaligned8): > /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always > - checking to make sure that we don't access beyond page boundary in > - SRC2. */ > - tst src1, #7 > - b.eq L(loop_misaligned) > + checking to make sure that we don't access beyond the end of SRC2. */ > + cbz tmp, L(src1_aligned) > L(do_misaligned): > - ldrb data1w, [src1], #1 > - ldrb data2w, [src2], #1 > - cmp data1w, #1 > - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ > + ldrb data1w, [src1], 1 > + ldrb data2w, [src2], 1 > + cmp data1w, 0 > + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ > b.ne L(done) > - tst src1, #7 > + tst src1, 7 > b.ne L(do_misaligned) > > -L(loop_misaligned): > - /* Test if we are within the last dword of the end of a 4K page. 
If > - yes then jump back to the misaligned loop to copy a byte at a time. */ > - and tmp1, src2, #0xff8 > - eor tmp1, tmp1, #0xff8 > - cbz tmp1, L(do_misaligned) > - ldr data1, [src1], #8 > - ldr data2, [src2], #8 > - > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - eor diff, data1, data2 /* Non-zero if differences found. */ > - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ > +L(src1_aligned): > + neg shift, src2, lsl 3 > + bic src2, src2, 7 > + ldr data3, [src2], 8 > +#ifdef __AARCH64EB__ > + rev data3, data3 > +#endif > + lsr tmp, zeroones, shift > + orr data3, data3, tmp > + sub has_nul, data3, zeroones > + orr tmp, data3, REP8_7f > + bics has_nul, has_nul, tmp > + b.ne L(tail) > + > + sub off1, src2, src1 > + > + .p2align 4 > + > +L(loop_unaligned): > + ldr data3, [src1, off1] > + ldr data2, [src1, off2] > +#ifdef __AARCH64EB__ > + rev data3, data3 > +#endif > + sub has_nul, data3, zeroones > + orr tmp, data3, REP8_7f > + ldr data1, [src1], 8 > + bics has_nul, has_nul, tmp > + ccmp data1, data2, 0, eq > + b.eq L(loop_unaligned) > + > + lsl tmp, has_nul, shift > +#ifdef __AARCH64EB__ > + rev tmp, tmp > +#endif > + eor diff, data1, data2 > + orr syndrome, diff, tmp > + cbnz syndrome, L(end) > +L(tail): > + ldr data1, [src1] > + neg shift, shift > + lsr data2, data3, shift > + lsr has_nul, has_nul, shift > +#ifdef __AARCH64EB__ > + rev data2, data2 > + rev has_nul, has_nul > +#endif > + eor diff, data1, data2 > orr syndrome, diff, has_nul > - cbz syndrome, L(loop_misaligned) > b L(end) > > L(done): > sub result, data1, data2 > ret > - .size strcmp, .-strcmp > > +END (strcmp) > #endif > diff --git a/newlib/libc/machine/aarch64/strcpy.S b/newlib/libc/machine/aarch64/strcpy.S > index e5405f2535..57c46f3908 100644 > --- a/newlib/libc/machine/aarch64/strcpy.S > +++ b/newlib/libc/machine/aarch64/strcpy.S > @@ -1,341 +1,160 @@ > /* > - strcpy/stpcpy - copy a string returning pointer to start/end. > - > - Copyright (c) 2013, 2014, 2015 ARM Ltd. 
> - All Rights Reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the company nor the names of its contributors > - may be used to endorse or promote products derived from this > - software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > + * strcpy/stpcpy - copy a string returning pointer to start/end. > + * > + * Copyright (c) 2020-2023, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strchr-stub.c */ > #else > > /* Assumptions: > * > - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. > + * ARMv8-a, AArch64, Advanced SIMD. > + * MTE compatible. 
> */ > > -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. > +#include "asmdefs.h" > > - To test the page crossing code path more thoroughly, compile with > - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower > - entry path. This option is not intended for production use. */ > - > -/* Arguments and results. */ > #define dstin x0 > #define srcin x1 > +#define result x0 > > -/* Locals and temporaries. */ > #define src x2 > #define dst x3 > -#define data1 x4 > -#define data1w w4 > -#define data2 x5 > -#define data2w w5 > -#define has_nul1 x6 > -#define has_nul2 x7 > -#define tmp1 x8 > -#define tmp2 x9 > -#define tmp3 x10 > -#define tmp4 x11 > -#define zeroones x12 > -#define data1a x13 > -#define data2a x14 > -#define pos x15 > -#define len x16 > -#define to_align x17 > +#define len x4 > +#define synd x4 > +#define tmp x5 > +#define shift x5 > +#define data1 x6 > +#define dataw1 w6 > +#define data2 x7 > +#define dataw2 w7 > + > +#define dataq q0 > +#define vdata v0 > +#define vhas_nul v1 > +#define vend v2 > +#define dend d2 > +#define dataq2 q1 > > #ifdef BUILD_STPCPY > -#define STRCPY stpcpy > +# define STRCPY stpcpy > +# define IFSTPCPY(X,...) X,__VA_ARGS__ > #else > -#define STRCPY strcpy > +# define STRCPY strcpy > +# define IFSTPCPY(X,...) > #endif > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > - can be done in parallel across the entire word. */ > - > -#define REP8_01 0x0101010101010101 > -#define REP8_7f 0x7f7f7f7f7f7f7f7f > -#define REP8_80 0x8080808080808080 > - > - /* AArch64 systems have a minimum page size of 4k. We can do a quick > - page size check for crossing this boundary on entry and if we > - do not, then we can short-circuit much of the entry code. 
We > - expect early page-crossing strings to be rare (probability of > - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite > - predictable, even with random strings. > - > - We don't bother checking for larger page sizes, the cost of setting > - up the correct page size is just not worth the extra gain from > - a small reduction in the cases taking the slow path. Note that > - we only care about whether the first fetch, which may be > - misaligned, crosses a page boundary - after that we move to aligned > - fetches for the remainder of the string. */ > - > -#ifdef STRCPY_TEST_PAGE_CROSS > - /* Make everything that isn't Qword aligned look like a page cross. */ > -#define MIN_PAGE_P2 4 > -#else > -#define MIN_PAGE_P2 12 > -#endif > - > -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) > - > -def_fn STRCPY p2align=6 > - /* For moderately short strings, the fastest way to do the copy is to > - calculate the length of the string in the same way as strlen, then > - essentially do a memcpy of the result. This avoids the need for > - multiple byte copies and further means that by the time we > - reach the bulk copy loop we know we can always use DWord > - accesses. We expect strcpy to rarely be called repeatedly > - with the same source string, so branch prediction is likely to > - always be difficult - we mitigate against this by preferring > - conditional select operations over branches whenever this is > - feasible. */ > - and tmp2, srcin, #(MIN_PAGE_SIZE - 1) > - mov zeroones, #REP8_01 > - and to_align, srcin, #15 > - cmp tmp2, #(MIN_PAGE_SIZE - 16) > - neg tmp1, to_align > - /* The first fetch will straddle a (possible) page boundary iff > - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte > - aligned string will never fail the page align check, so will > - always take the fast path. 
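[The removed comment above reasons about page crossing. The predicate it describes — does a 16-byte fetch starting at p straddle a 4 KiB minimum-size page boundary? — can be sketched in C; the helper name is illustrative, not from the patch.]

```c
#include <stdbool.h>
#include <stdint.h>

#define MIN_PAGE_SIZE 4096

/* A 16-byte load starting at p crosses a page boundary iff p's
   offset within a minimum-size page leaves fewer than 16 bytes
   before the page end. */
static bool load16_crosses_page(uintptr_t p)
{
    return (p & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}
```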
*/ > - b.gt .Lpage_cross > - > -.Lpage_cross_ok: > - ldp data1, data2, [srcin] > -#ifdef __AARCH64EB__ > - /* Because we expect the end to be found within 16 characters > - (profiling shows this is the most common case), it's worth > - swapping the bytes now to save having to recalculate the > - termination syndrome later. We preserve data1 and data2 > - so that we can re-use the values later on. */ > - rev tmp2, data1 > - sub tmp1, tmp2, zeroones > - orr tmp2, tmp2, #REP8_7f > - bics has_nul1, tmp1, tmp2 > - b.ne .Lfp_le8 > - rev tmp4, data2 > - sub tmp3, tmp4, zeroones > - orr tmp4, tmp4, #REP8_7f > -#else > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - bics has_nul1, tmp1, tmp2 > - b.ne .Lfp_le8 > - sub tmp3, data2, zeroones > - orr tmp4, data2, #REP8_7f > +/* > + Core algorithm: > + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits > + per byte. We take 4 bits of every comparison byte with shift right and narrow > + by 4 instruction. Since the bits in the nibble mask reflect the order in > + which things occur in the original string, counting leading zeros identifies > + exactly which byte matched. */ > + > +ENTRY (STRCPY) > + PTR_ARG (0) > + PTR_ARG (1) > + bic src, srcin, 15 > + ld1 {vdata.16b}, [src] > + cmeq vhas_nul.16b, vdata.16b, 0 > + lsl shift, srcin, 2 > + shrn vend.8b, vhas_nul.8h, 4 > + fmov synd, dend > + lsr synd, synd, shift > + cbnz synd, L(tail) > + > + ldr dataq, [src, 16]! 
> + cmeq vhas_nul.16b, vdata.16b, 0 > + shrn vend.8b, vhas_nul.8h, 4 > + fmov synd, dend > + cbz synd, L(start_loop) > + > +#ifndef __AARCH64EB__ > + rbit synd, synd > #endif > - bics has_nul2, tmp3, tmp4 > - b.eq .Lbulk_entry > + sub tmp, src, srcin > + clz len, synd > + add len, tmp, len, lsr 2 > + tbz len, 4, L(less16) > + sub tmp, len, 15 > + ldr dataq, [srcin] > + ldr dataq2, [srcin, tmp] > + str dataq, [dstin] > + str dataq2, [dstin, tmp] > + IFSTPCPY (add result, dstin, len) > + ret > > - /* The string is short (<=16 bytes). We don't know exactly how > - short though, yet. Work out the exact length so that we can > - quickly select the optimal copy strategy. */ > -.Lfp_gt8: > - rev has_nul2, has_nul2 > - clz pos, has_nul2 > - mov tmp2, #56 > - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ > - sub pos, tmp2, pos > -#ifdef __AARCH64EB__ > - lsr data2, data2, pos > -#else > - lsl data2, data2, pos > -#endif > - str data2, [dst, #1] > +L(tail): > + rbit synd, synd > + clz len, synd > + lsr len, len, 2 > +L(less16): > + tbz len, 3, L(less8) > + sub tmp, len, 7 > + ldr data1, [srcin] > + ldr data2, [srcin, tmp] > str data1, [dstin] > -#ifdef BUILD_STPCPY > - add dstin, dst, #8 > -#endif > + str data2, [dstin, tmp] > + IFSTPCPY (add result, dstin, len) > ret > > -.Lfp_le8: > - rev has_nul1, has_nul1 > - clz pos, has_nul1 > - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ > - subs tmp2, pos, #24 /* Pos in bits. */ > - b.lt .Lfp_lt4 > -#ifdef __AARCH64EB__ > - mov tmp2, #56 > - sub pos, tmp2, pos > - lsr data2, data1, pos > - lsr data1, data1, #32 > -#else > - lsr data2, data1, tmp2 > -#endif > - /* 4->7 bytes to copy. */ > - str data2w, [dst, #-3] > - str data1w, [dstin] > -#ifdef BUILD_STPCPY > - mov dstin, dst > -#endif > - ret > -.Lfp_lt4: > - cbz pos, .Lfp_lt2 > - /* 2->3 bytes to copy. */ > -#ifdef __AARCH64EB__ > - lsr data1, data1, #48 > -#endif > - strh data1w, [dstin] > - /* Fall-through, one byte (max) to go. 
*/ > -.Lfp_lt2: > - /* Null-terminated string. Last character must be zero! */ > - strb wzr, [dst] > -#ifdef BUILD_STPCPY > - mov dstin, dst > -#endif > + .p2align 4 > +L(less8): > + subs tmp, len, 3 > + b.lo L(less4) > + ldr dataw1, [srcin] > + ldr dataw2, [srcin, tmp] > + str dataw1, [dstin] > + str dataw2, [dstin, tmp] > + IFSTPCPY (add result, dstin, len) > ret > > - .p2align 6 > - /* Aligning here ensures that the entry code and main loop all lies > - within one 64-byte cache line. */ > -.Lbulk_entry: > - sub to_align, to_align, #16 > - stp data1, data2, [dstin] > - sub src, srcin, to_align > - sub dst, dstin, to_align > - b .Lentry_no_page_cross > - > - /* The inner loop deals with two Dwords at a time. This has a > - slightly higher start-up cost, but we should win quite quickly, > - especially on cores with a high number of issue slots per > - cycle, as we get much better parallelism out of the operations. */ > -.Lmain_loop: > - stp data1, data2, [dst], #16 > -.Lentry_no_page_cross: > - ldp data1, data2, [src], #16 > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, #REP8_7f > - bic has_nul1, tmp1, tmp2 > - bics has_nul2, tmp3, tmp4 > - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ > - b.eq .Lmain_loop > - > - /* Since we know we are copying at least 16 bytes, the fastest way > - to deal with the tail is to determine the location of the > - trailing NUL, then (re)copy the 16 bytes leading up to that. */ > - cmp has_nul1, #0 > -#ifdef __AARCH64EB__ > - /* For big-endian, carry propagation (if the final byte in the > - string is 0x01) means we cannot use has_nul directly. The > - easiest way to get the correct byte is to byte-swap the data > - and calculate the syndrome a second time. 
*/ > - csel data1, data1, data2, ne > - rev data1, data1 > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - bic has_nul1, tmp1, tmp2 > -#else > - csel has_nul1, has_nul1, has_nul2, ne > -#endif > - rev has_nul1, has_nul1 > - clz pos, has_nul1 > - add tmp1, pos, #72 > - add pos, pos, #8 > - csel pos, pos, tmp1, ne > - add src, src, pos, lsr #3 > - add dst, dst, pos, lsr #3 > - ldp data1, data2, [src, #-32] > - stp data1, data2, [dst, #-16] > -#ifdef BUILD_STPCPY > - sub dstin, dst, #1 > -#endif > +L(less4): > + cbz len, L(zerobyte) > + ldrh dataw1, [srcin] > + strh dataw1, [dstin] > +L(zerobyte): > + strb wzr, [dstin, len] > + IFSTPCPY (add result, dstin, len) > ret > > -.Lpage_cross: > - bic src, srcin, #15 > - /* Start by loading two words at [srcin & ~15], then forcing the > - bytes that precede srcin to 0xff. This means they never look > - like termination bytes. */ > - ldp data1, data2, [src] > - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ > - tst to_align, #7 > - csetm tmp2, ne > -#ifdef __AARCH64EB__ > - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ > -#else > - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ > + .p2align 4 > +L(start_loop): > + sub tmp, srcin, dstin > + ldr dataq2, [srcin] > + sub dst, src, tmp > + str dataq2, [dstin] > +L(loop): > + str dataq, [dst], 32 > + ldr dataq, [src, 16] > + cmeq vhas_nul.16b, vdata.16b, 0 > + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b > + fmov synd, dend > + cbnz synd, L(loopend) > + str dataq, [dst, -16] > + ldr dataq, [src, 32]! 
> + cmeq vhas_nul.16b, vdata.16b, 0 > + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b > + fmov synd, dend > + cbz synd, L(loop) > + add dst, dst, 16 > +L(loopend): > + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ > + fmov synd, dend > + sub dst, dst, 31 > +#ifndef __AARCH64EB__ > + rbit synd, synd > #endif > - orr data1, data1, tmp2 > - orr data2a, data2, tmp2 > - cmp to_align, #8 > - csinv data1, data1, xzr, lt > - csel data2, data2, data2a, lt > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, #REP8_7f > - bic has_nul1, tmp1, tmp2 > - bics has_nul2, tmp3, tmp4 > - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ > - b.eq .Lpage_cross_ok > - /* We now need to make data1 and data2 look like they've been > - loaded directly from srcin. Do a rotate on the 128-bit value. */ > - lsl tmp1, to_align, #3 /* Bytes->bits. */ > - neg tmp2, to_align, lsl #3 > -#ifdef __AARCH64EB__ > - lsl data1a, data1, tmp1 > - lsr tmp4, data2, tmp2 > - lsl data2, data2, tmp1 > - orr tmp4, tmp4, data1a > - cmp to_align, #8 > - csel data1, tmp4, data2, lt > - rev tmp2, data1 > - rev tmp4, data2 > - sub tmp1, tmp2, zeroones > - orr tmp2, tmp2, #REP8_7f > - sub tmp3, tmp4, zeroones > - orr tmp4, tmp4, #REP8_7f > -#else > - lsr data1a, data1, tmp1 > - lsl tmp4, data2, tmp2 > - lsr data2, data2, tmp1 > - orr tmp4, tmp4, data1a > - cmp to_align, #8 > - csel data1, tmp4, data2, lt > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, #REP8_7f > -#endif > - bic has_nul1, tmp1, tmp2 > - cbnz has_nul1, .Lfp_le8 > - bic has_nul2, tmp3, tmp4 > - b .Lfp_gt8 > + clz len, synd > + lsr len, len, 2 > + add dst, dst, len > + ldr dataq, [dst, tmp] > + str dataq, [dst] > + IFSTPCPY (add result, dst, 15) > + ret > > - .size STRCPY, . 
- STRCPY > +END (STRCPY) > #endif > diff --git a/newlib/libc/machine/aarch64/strlen.S b/newlib/libc/machine/aarch64/strlen.S > index 872d136ef4..68a6f357cf 100644 > --- a/newlib/libc/machine/aarch64/strlen.S > +++ b/newlib/libc/machine/aarch64/strlen.S > @@ -1,115 +1,92 @@ > -/* Copyright (c) 2013-2015, Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > +/* > + * strlen - calculate the length of a string. > + * > + * Copyright (c) 2020-2022, Arm Limited. 
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strlen-stub.c */ > #else > > /* Assumptions: > * > - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. > + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. > + * Not MTE compatible. > */ > > -/* To test the page crossing code path more thoroughly, compile with > - -DTEST_PAGE_CROSS - this will force all calls through the slower > - entry path. This option is not intended for production use. */ > - > -/* Arguments and results. */ > -#define srcin x0 > -#define len x0 > - > -/* Locals and temporaries. */ > -#define src x1 > -#define data1 x2 > -#define data2 x3 > -#define has_nul1 x4 > -#define has_nul2 x5 > -#define tmp1 x4 > -#define tmp2 x5 > -#define tmp3 x6 > -#define tmp4 x7 > -#define zeroones x8 > - > -#define L(l) .L ## l > - > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > - can be done in parallel across the entire word. A faster check > - (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives > - false hits for characters 129..255. 
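[The removed strlen comment above contrasts the exact NUL check with the cheaper one that false-hits on bytes 129..255. A C sketch of the two checks, with illustrative function names (REP8_* constants match the patch):]

```c
#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL
#define REP8_80 0x8080808080808080ULL

/* Exact check: non-zero iff the word contains a zero byte. */
static uint64_t has_nul_exact(uint64_t x)
{
    return (x - REP8_01) & ~x & REP8_80;
}

/* Fast check: one operation cheaper, but bytes 0x81..0xff (and
   borrow propagation) can also trip it, so a hit must be confirmed
   with the exact check. */
static uint64_t has_nul_fast(uint64_t x)
{
    return (x - REP8_01) & REP8_80;
}
```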
*/ > +#include "asmdefs.h" > + > +#define srcin x0 > +#define len x0 > + > +#define src x1 > +#define data1 x2 > +#define data2 x3 > +#define has_nul1 x4 > +#define has_nul2 x5 > +#define tmp1 x4 > +#define tmp2 x5 > +#define tmp3 x6 > +#define tmp4 x7 > +#define zeroones x8 > + > +#define maskv v0 > +#define maskd d0 > +#define dataq1 q1 > +#define dataq2 q2 > +#define datav1 v1 > +#define datav2 v2 > +#define tmp x2 > +#define tmpw w2 > +#define synd x3 > +#define syndw w3 > +#define shift x4 > + > +/* For the first 32 bytes, NUL detection works on the principle that > + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a > + byte is zero, and can be done in parallel across the entire word. */ > > #define REP8_01 0x0101010101010101 > #define REP8_7f 0x7f7f7f7f7f7f7f7f > -#define REP8_80 0x8080808080808080 > + > +/* To test the page crossing code path more thoroughly, compile with > + -DTEST_PAGE_CROSS - this will force all calls through the slower > + entry path. This option is not intended for production use. */ > > #ifdef TEST_PAGE_CROSS > -# define MIN_PAGE_SIZE 15 > +# define MIN_PAGE_SIZE 32 > #else > # define MIN_PAGE_SIZE 4096 > #endif > > - /* Since strings are short on average, we check the first 16 bytes > - of the string for a NUL character. In order to do an unaligned ldp > - safely we have to do a page cross check first. If there is a NUL > - byte we calculate the length from the 2 8-byte words using > - conditional select to reduce branch mispredictions (it is unlikely > - strlen will be repeatedly called on strings with the same length). > - > - If the string is longer than 16 bytes, we align src so don't need > - further page cross checks, and process 32 bytes per iteration > - using the fast NUL check. If we encounter non-ASCII characters, > - fallback to a second loop using the full NUL check. 
> - > - If the page cross check fails, we read 16 bytes from an aligned > - address, remove any characters before the string, and continue > - in the main loop using aligned loads. Since strings crossing a > - page in the first 16 bytes are rare (probability of > - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. > - > - AArch64 systems have a minimum page size of 4k. We don't bother > - checking for larger page sizes - the cost of setting up the correct > - page size is just not worth the extra gain from a small reduction in > - the cases taking the slow path. Note that we only care about > - whether the first fetch, which may be misaligned, crosses a page > - boundary. */ > - > -def_fn strlen p2align=6 > +/* Core algorithm: > + > + Since strings are short on average, we check the first 32 bytes of the > + string for a NUL character without aligning the string. In order to use > + unaligned loads safely we must do a page cross check first. > + > + If there is a NUL byte we calculate the length from the 2 8-byte words > + using conditional select to reduce branch mispredictions (it is unlikely > + strlen will be repeatedly called on strings with the same length). > + > + If the string is longer than 32 bytes, align src so we don't need further > + page cross checks, and process 32 bytes per iteration using a fast SIMD > + loop. > + > + If the page cross check fails, we read 32 bytes from an aligned address, > + and ignore any characters before the string. If it contains a NUL > + character, return the length, if not, continue in the main loop. */ > + > +ENTRY (strlen) > + PTR_ARG (0) > and tmp1, srcin, MIN_PAGE_SIZE - 1 > - mov zeroones, REP8_01 > - cmp tmp1, MIN_PAGE_SIZE - 16 > - b.gt L(page_cross) > + cmp tmp1, MIN_PAGE_SIZE - 32 > + b.hi L(page_cross) > + > + /* Look for a NUL byte in the first 16 bytes. 
*/ > ldp data1, data2, [srcin] > + mov zeroones, REP8_01 > + > #ifdef __AARCH64EB__ > /* For big-endian, carry propagation (if the final byte in the > string is 0x01) means we cannot use has_nul1/2 directly. > @@ -125,114 +102,96 @@ def_fn strlen p2align=6 > bics has_nul1, tmp1, tmp2 > bic has_nul2, tmp3, tmp4 > ccmp has_nul2, 0, 0, eq > - beq L(main_loop_entry) > + b.eq L(bytes16_31) > > - /* Enter with C = has_nul1 == 0. */ > + /* Find the exact offset of the first NUL byte in the first 16 bytes > + from the string start. Enter with C = has_nul1 == 0. */ > csel has_nul1, has_nul1, has_nul2, cc > mov len, 8 > rev has_nul1, has_nul1 > - clz tmp1, has_nul1 > csel len, xzr, len, cc > + clz tmp1, has_nul1 > add len, len, tmp1, lsr 3 > ret > > - /* The inner loop processes 32 bytes per iteration and uses the fast > - NUL check. If we encounter non-ASCII characters, use a second > - loop with the accurate NUL check. */ > - .p2align 4 > -L(main_loop_entry): > - bic src, srcin, 15 > - sub src, src, 16 > -L(main_loop): > - ldp data1, data2, [src, 32]! > -.Lpage_cross_entry: > - sub tmp1, data1, zeroones > - sub tmp3, data2, zeroones > - orr tmp2, tmp1, tmp3 > - tst tmp2, zeroones, lsl 7 > - bne 1f > - ldp data1, data2, [src, 16] > + /* Look for a NUL byte at offset 16..31 in the string. */ > +L(bytes16_31): > + ldp data1, data2, [srcin, 16] > +#ifdef __AARCH64EB__ > + rev data1, data1 > + rev data2, data2 > +#endif > sub tmp1, data1, zeroones > - sub tmp3, data2, zeroones > - orr tmp2, tmp1, tmp3 > - tst tmp2, zeroones, lsl 7 > - beq L(main_loop) > - add src, src, 16 > -1: > - /* The fast check failed, so do the slower, accurate NUL check. */ > orr tmp2, data1, REP8_7f > + sub tmp3, data2, zeroones > orr tmp4, data2, REP8_7f > bics has_nul1, tmp1, tmp2 > bic has_nul2, tmp3, tmp4 > ccmp has_nul2, 0, 0, eq > - beq L(nonascii_loop) > + b.eq L(loop_entry) > > - /* Enter with C = has_nul1 == 0. 
*/ > -L(tail): > -#ifdef __AARCH64EB__ > - /* For big-endian, carry propagation (if the final byte in the > - string is 0x01) means we cannot use has_nul1/2 directly. The > - easiest way to get the correct byte is to byte-swap the data > - and calculate the syndrome a second time. */ > - csel data1, data1, data2, cc > - rev data1, data1 > - sub tmp1, data1, zeroones > - orr tmp2, data1, REP8_7f > - bic has_nul1, tmp1, tmp2 > -#else > + /* Find the exact offset of the first NUL byte at offset 16..31 from > + the string start. Enter with C = has_nul1 == 0. */ > csel has_nul1, has_nul1, has_nul2, cc > -#endif > - sub len, src, srcin > + mov len, 24 > rev has_nul1, has_nul1 > - add tmp2, len, 8 > + mov tmp3, 16 > clz tmp1, has_nul1 > - csel len, len, tmp2, cc > + csel len, tmp3, len, cc > add len, len, tmp1, lsr 3 > ret > > -L(nonascii_loop): > - ldp data1, data2, [src, 16]! > - sub tmp1, data1, zeroones > - orr tmp2, data1, REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, REP8_7f > - bics has_nul1, tmp1, tmp2 > - bic has_nul2, tmp3, tmp4 > - ccmp has_nul2, 0, 0, eq > - bne L(tail) > - ldp data1, data2, [src, 16]! > - sub tmp1, data1, zeroones > - orr tmp2, data1, REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, REP8_7f > - bics has_nul1, tmp1, tmp2 > - bic has_nul2, tmp3, tmp4 > - ccmp has_nul2, 0, 0, eq > - beq L(nonascii_loop) > - b L(tail) > + nop > +L(loop_entry): > + bic src, srcin, 31 > + > + .p2align 5 > +L(loop): > + ldp dataq1, dataq2, [src, 32]! > + uminp maskv.16b, datav1.16b, datav2.16b > + uminp maskv.16b, maskv.16b, maskv.16b > + cmeq maskv.8b, maskv.8b, 0 > + fmov synd, maskd > + cbz synd, L(loop) > + > + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ > + cmeq maskv.16b, datav1.16b, 0 > + sub len, src, srcin > + cbnz syndw, 1f > + cmeq maskv.16b, datav2.16b, 0 > + add len, len, 16 > +1: > + /* Generate a bitmask and compute correct byte offset. 
*/ > + shrn maskv.8b, maskv.8h, 4 > + fmov synd, maskd > +#ifndef __AARCH64EB__ > + rbit synd, synd > +#endif > + clz tmp, synd > + add len, len, tmp, lsr 2 > + ret > > - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede > - srcin to 0x7f, so we ignore any NUL bytes before the string. > - Then continue in the aligned loop. */ > L(page_cross): > - bic src, srcin, 15 > - ldp data1, data2, [src] > - lsl tmp1, srcin, 3 > - mov tmp4, -1 > -#ifdef __AARCH64EB__ > - /* Big-endian. Early bytes are at MSB. */ > - lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ > -#else > - /* Little-endian. Early bytes are at LSB. */ > - lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ > -#endif > - orr tmp1, tmp1, REP8_80 > - orn data1, data1, tmp1 > - orn tmp2, data2, tmp1 > - tst srcin, 8 > - csel data1, data1, tmp4, eq > - csel data2, data2, tmp2, eq > - b L(page_cross_entry) > - > - .size strlen, . - strlen > + bic src, srcin, 31 > + mov tmpw, 0x0c03 > + movk tmpw, 0xc030, lsl 16 > + ld1 {datav1.16b, datav2.16b}, [src] > + dup maskv.4s, tmpw > + cmeq datav1.16b, datav1.16b, 0 > + cmeq datav2.16b, datav2.16b, 0 > + and datav1.16b, datav1.16b, maskv.16b > + and datav2.16b, datav2.16b, maskv.16b > + addp maskv.16b, datav1.16b, datav2.16b > + addp maskv.16b, maskv.16b, maskv.16b > + fmov synd, maskd > + lsl shift, srcin, 1 > + lsr synd, synd, shift > + cbz synd, L(loop) > + > + rbit synd, synd > + clz len, synd > + lsr len, len, 1 > + ret > + > +END (strlen) > #endif > diff --git a/newlib/libc/machine/aarch64/strncmp.S b/newlib/libc/machine/aarch64/strncmp.S > index ffdabc2607..373695503d 100644 > --- a/newlib/libc/machine/aarch64/strncmp.S > +++ b/newlib/libc/machine/aarch64/strncmp.S > @@ -1,49 +1,23 @@ > -/* Copyright (c) 2013, 2018, Linaro Limited > - All rights reserved. 
> - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > +/* > + * strncmp - compare two strings > + * > + * Copyright (c) 2013-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strcmp-stub.c */ > #else > > /* Assumptions: > * > - * ARMv8-a, AArch64 > + * ARMv8-a, AArch64. > + * MTE compatible. 
> */ > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > +#include "asmdefs.h" > > #define REP8_01 0x0101010101010101 > #define REP8_7f 0x7f7f7f7f7f7f7f7f > -#define REP8_80 0x8080808080808080 > > /* Parameters and result. */ > #define src1 x0 > @@ -64,86 +38,91 @@ > #define tmp3 x10 > #define zeroones x11 > #define pos x12 > -#define limit_wd x13 > -#define mask x14 > -#define endloop x15 > +#define mask x13 > +#define endloop x14 > #define count mask > +#define offset pos > +#define neg_offset x15 > + > +/* Define endian dependent shift operations. > + On big-endian early bytes are at MSB and on little-endian LSB. > + LS_FW means shifting towards early bytes. > + LS_BK means shifting towards later bytes. > + */ > +#ifdef __AARCH64EB__ > +#define LS_FW lsl > +#define LS_BK lsr > +#else > +#define LS_FW lsr > +#define LS_BK lsl > +#endif > > - .text > - .p2align 6 > - .rep 7 > - nop /* Pad so that the loop below fits a cache line. */ > - .endr > -def_fn strncmp > - cbz limit, .Lret0 > +ENTRY (strncmp) > + PTR_ARG (0) > + PTR_ARG (1) > + SIZE_ARG (2) > + cbz limit, L(ret0) > eor tmp1, src1, src2 > mov zeroones, #REP8_01 > tst tmp1, #7 > and count, src1, #7 > - b.ne .Lmisaligned8 > - cbnz count, .Lmutual_align > - /* Calculate the number of full and partial words -1. */ > - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ > - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ > + b.ne L(misaligned8) > + cbnz count, L(mutual_align) > > /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > can be done in parallel across the entire word. */ > - /* Start of performance-critical section -- one 64B cache line. 
*/ > -.Lloop_aligned: > + .p2align 4 > +L(loop_aligned): > ldr data1, [src1], #8 > ldr data2, [src2], #8 > -.Lstart_realigned: > - subs limit_wd, limit_wd, #1 > +L(start_realigned): > + subs limit, limit, #8 > sub tmp1, data1, zeroones > orr tmp2, data1, #REP8_7f > eor diff, data1, data2 /* Non-zero if differences found. */ > - csinv endloop, diff, xzr, pl /* Last Dword or differences. */ > + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ > bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ > ccmp endloop, #0, #0, eq > - b.eq .Lloop_aligned > - /* End of performance-critical section -- one 64B cache line. */ > + b.eq L(loop_aligned) > + /* End of main loop */ > > - /* Not reached the limit, must have found the end or a diff. */ > - tbz limit_wd, #63, .Lnot_limit > - > - /* Limit % 8 == 0 => all bytes significant. */ > - ands limit, limit, #7 > - b.eq .Lnot_limit > - > - lsl limit, limit, #3 /* Bits -> bytes. */ > - mov mask, #~0 > -#ifdef __AARCH64EB__ > - lsr mask, mask, limit > -#else > - lsl mask, mask, limit > -#endif > - bic data1, data1, mask > - bic data2, data2, mask > - > - /* Make sure that the NUL byte is marked in the syndrome. */ > - orr has_nul, has_nul, mask > - > -.Lnot_limit: > +L(full_check): > +#ifndef __AARCH64EB__ > orr syndrome, diff, has_nul > - > -#ifndef __AARCH64EB__ > + add limit, limit, 8 /* Rewind limit to before last subs. */ > +L(syndrome_check): > + /* Limit was reached. Check if the NUL byte or the difference > + is before the limit. */ > rev syndrome, syndrome > rev data1, data1 > - /* The MS-non-zero bit of the syndrome marks either the first bit > - that is different, or the top bit of the first zero byte. > - Shifting left now will bring the critical information into the > - top bits. */ > clz pos, syndrome > rev data2, data2 > lsl data1, data1, pos > + cmp limit, pos, lsr #3 > lsl data2, data2, pos > /* But we need to zero-extend (char is unsigned) the value and then > perform a signed 32-bit subtraction. 
*/ > lsr data1, data1, #56 > sub result, data1, data2, lsr #56 > + csel result, result, xzr, hi > ret > #else > + /* Not reached the limit, must have found the end or a diff. */ > + tbz limit, #63, L(not_limit) > + add tmp1, limit, 8 > + cbz limit, L(not_limit) > + > + lsl limit, tmp1, #3 /* Bits -> bytes. */ > + mov mask, #~0 > + lsr mask, mask, limit > + bic data1, data1, mask > + bic data2, data2, mask > + > + /* Make sure that the NUL byte is marked in the syndrome. */ > + orr has_nul, has_nul, mask > + > +L(not_limit): > /* For big-endian we cannot use the trick with the syndrome value > as carry-propagation can corrupt the upper bits if the trailing > bytes in the string contain 0x01. */ > @@ -164,10 +143,11 @@ def_fn strncmp > rev has_nul, has_nul > orr syndrome, diff, has_nul > clz pos, syndrome > - /* The MS-non-zero bit of the syndrome marks either the first bit > - that is different, or the top bit of the first zero byte. > + /* The most-significant-non-zero bit of the syndrome marks either the > + first bit that is different, or the top bit of the first zero byte. > Shifting left now will bring the critical information into the > top bits. */ > +L(end_quick): > lsl data1, data1, pos > lsl data2, data2, pos > /* But we need to zero-extend (char is unsigned) the value and then > @@ -177,7 +157,7 @@ def_fn strncmp > ret > #endif > > -.Lmutual_align: > +L(mutual_align): > /* Sources are mutually aligned, but are not currently at an > alignment boundary. Round down the addresses and then mask off > the bytes that precede the start point. > @@ -189,102 +169,143 @@ def_fn strncmp > neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ > ldr data2, [src2], #8 > mov tmp2, #~0 > - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ > -#ifdef __AARCH64EB__ > - /* Big-endian. Early bytes are at MSB. */ > - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ > -#else > - /* Little-endian. Early bytes are at LSB. 
*/ > - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ > -#endif > - and tmp3, limit_wd, #7 > - lsr limit_wd, limit_wd, #3 > - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ > - add limit, limit, count > - add tmp3, tmp3, count > + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ > + /* Adjust the limit and ensure it doesn't overflow. */ > + adds limit, limit, count > + csinv limit, limit, xzr, lo > orr data1, data1, tmp2 > orr data2, data2, tmp2 > - add limit_wd, limit_wd, tmp3, lsr #3 > - b .Lstart_realigned > + b L(start_realigned) > > - .p2align 6 > + .p2align 4 > /* Don't bother with dwords for up to 16 bytes. */ > -.Lmisaligned8: > +L(misaligned8): > cmp limit, #16 > - b.hs .Ltry_misaligned_words > + b.hs L(try_misaligned_words) > > -.Lbyte_loop: > +L(byte_loop): > /* Perhaps we can do better than this. */ > ldrb data1w, [src1], #1 > ldrb data2w, [src2], #1 > subs limit, limit, #1 > ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ > ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ > - b.eq .Lbyte_loop > -.Ldone: > + b.eq L(byte_loop) > +L(done): > sub result, data1, data2 > ret > /* Align the SRC1 to a dword by doing a bytewise compare and then do > the dword loop. */ > -.Ltry_misaligned_words: > - lsr limit_wd, limit, #3 > - cbz count, .Ldo_misaligned > +L(try_misaligned_words): > + cbz count, L(src1_aligned) > > neg count, count > and count, count, #7 > sub limit, limit, count > - lsr limit_wd, limit, #3 > > -.Lpage_end_loop: > +L(page_end_loop): > ldrb data1w, [src1], #1 > ldrb data2w, [src2], #1 > cmp data1w, #1 > ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ > - b.ne .Ldone > + b.ne L(done) > subs count, count, #1 > - b.hi .Lpage_end_loop > + b.hi L(page_end_loop) > + > + /* The following diagram explains the comparison of misaligned strings. > + The bytes are shown in natural order. For little-endian, it is > + reversed in the registers. The "x" bytes are before the string. > + The "|" separates data that is loaded at one time. 
> + src1 | a a a a a a a a | b b b c c c c c | . . . > + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . > > -.Ldo_misaligned: > - /* Prepare ourselves for the next page crossing. Unlike the aligned > - loop, we fetch 1 less dword because we risk crossing bounds on > - SRC2. */ > - mov count, #8 > - subs limit_wd, limit_wd, #1 > - b.lo .Ldone_loop > -.Lloop_misaligned: > - and tmp2, src2, #0xff8 > - eor tmp2, tmp2, #0xff8 > - cbz tmp2, .Lpage_end_loop > + After shifting in each step, the data looks like this: > + STEP_A STEP_B STEP_C > + data1 a a a a a a a a b b b c c c c c b b b c c c c c > + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c > > + The bytes with "0" are eliminated from the syndrome via mask. > + > + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a > + time from SRC2. The comparison happens in 3 steps. After each step > + the loop can exit, or read from SRC1 or SRC2. */ > +L(src1_aligned): > + /* Calculate offset from 8 byte alignment to string start in bits. No > + need to mask offset since shifts are ignoring upper bits. */ > + lsl offset, src2, #3 > + bic src2, src2, #0xf > + mov mask, -1 > + neg neg_offset, offset > ldr data1, [src1], #8 > - ldr data2, [src2], #8 > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - eor diff, data1, data2 /* Non-zero if differences found. */ > - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ > - ccmp diff, #0, #0, eq > - b.ne .Lnot_limit > - subs limit_wd, limit_wd, #1 > - b.pl .Lloop_misaligned > + ldp tmp1, tmp2, [src2], #16 > + LS_BK mask, mask, neg_offset > + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ > + /* Skip the first compare if data in tmp1 is irrelevant. */ > + tbnz offset, 6, L(misaligned_mid_loop) > > -.Ldone_loop: > - /* We found a difference or a NULL before the limit was reached. */ > - and limit, limit, #7 > - cbz limit, .Lnot_limit > - /* Read the last word. 
*/ > - sub src1, src1, 8 > - sub src2, src2, 8 > - ldr data1, [src1, limit] > - ldr data2, [src2, limit] > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > +L(loop_misaligned): > + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ > + LS_FW data2, tmp1, offset > + LS_BK tmp1, tmp2, neg_offset > + subs limit, limit, #8 > + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ > + sub has_nul, data1, zeroones > eor diff, data1, data2 /* Non-zero if differences found. */ > - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ > - ccmp diff, #0, #0, eq > - b.ne .Lnot_limit > + orr tmp3, data1, #REP8_7f > + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ > + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ > + orr tmp3, endloop, has_nul > + cbnz tmp3, L(full_check) > + > + ldr data1, [src1], #8 > +L(misaligned_mid_loop): > + /* STEP_B: Compare first part of data1 to second part of tmp2. */ > + LS_FW data2, tmp2, offset > +#ifdef __AARCH64EB__ > + /* For big-endian we do a byte reverse to avoid carry-propagation > + problem described above. This way we can reuse the has_nul in the > + next step and also use syndrome value trick at the end. */ > + rev tmp3, data1 > + #define data1_fixed tmp3 > +#else > + #define data1_fixed data1 > +#endif > + sub has_nul, data1_fixed, zeroones > + orr tmp3, data1_fixed, #REP8_7f > + eor diff, data2, data1 /* Non-zero if differences found. */ > + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ > +#ifdef __AARCH64EB__ > + rev has_nul, has_nul > +#endif > + cmp limit, neg_offset, lsr #3 > + orr syndrome, diff, has_nul > + bic syndrome, syndrome, mask /* Ignore later bytes. */ > + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ > + cbnz tmp3, L(syndrome_check) > + > + /* STEP_C: Compare second part of data1 to first part of tmp1. 
*/ > + ldp tmp1, tmp2, [src2], #16 > + cmp limit, #8 > + LS_BK data2, tmp1, neg_offset > + eor diff, data2, data1 /* Non-zero if differences found. */ > + orr syndrome, diff, has_nul > + and syndrome, syndrome, mask /* Ignore earlier bytes. */ > + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ > + cbnz tmp3, L(syndrome_check) > + > + ldr data1, [src1], #8 > + sub limit, limit, #8 > + b L(loop_misaligned) > + > +#ifdef __AARCH64EB__ > +L(syndrome_check): > + clz pos, syndrome > + cmp pos, limit, lsl #3 > + b.lo L(end_quick) > +#endif > > -.Lret0: > +L(ret0): > mov result, #0 > ret > - .size strncmp, . - strncmp > +END(strncmp) > #endif > diff --git a/newlib/libc/machine/aarch64/strnlen.S b/newlib/libc/machine/aarch64/strnlen.S > index c255c3f7c6..091002e0b0 100644 > --- a/newlib/libc/machine/aarch64/strnlen.S > +++ b/newlib/libc/machine/aarch64/strnlen.S > @@ -1,187 +1,105 @@ > -/* strnlen - calculate the length of a string with limit. > - > - Copyright (c) 2013, Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > +/* > + * strnlen - calculate the length of a string with limit. > + * > + * Copyright (c) 2020-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strlen-stub.c */ > #else > > /* Assumptions: > * > - * ARMv8-a, AArch64 > + * ARMv8-a, AArch64, Advanced SIMD. > + * MTE compatible. > */ > > -/* Arguments and results. */ > +#include "asmdefs.h" > + > #define srcin x0 > -#define len x0 > -#define limit x1 > +#define cntin x1 > +#define result x0 > > -/* Locals and temporaries. */ > #define src x2 > -#define data1 x3 > -#define data2 x4 > -#define data2a x5 > -#define has_nul1 x6 > -#define has_nul2 x7 > -#define tmp1 x8 > -#define tmp2 x9 > -#define tmp3 x10 > -#define tmp4 x11 > -#define zeroones x12 > -#define pos x13 > -#define limit_wd x14 > - > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -#define REP8_01 0x0101010101010101 > -#define REP8_7f 0x7f7f7f7f7f7f7f7f > -#define REP8_80 0x8080808080808080 > - > - .text > - .p2align 6 > -.Lstart: > - /* Pre-pad to ensure critical loop begins an icache line. */ > - .rep 7 > - nop > - .endr > - /* Put this code here to avoid wasting more space with pre-padding. 
*/ > -.Lhit_limit: > - mov len, limit > +#define synd x3 > +#define shift x4 > +#define tmp x4 > +#define cntrem x5 > + > +#define qdata q0 > +#define vdata v0 > +#define vhas_chr v1 > +#define vend v2 > +#define dend d2 > + > +/* > + Core algorithm: > + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with > + four bits per byte using the shrn instruction. A count trailing zeros then > + identifies the first zero byte. */ > + > +ENTRY (strnlen) > + PTR_ARG (0) > + SIZE_ARG (1) > + bic src, srcin, 15 > + cbz cntin, L(nomatch) > + ld1 {vdata.16b}, [src] > + cmeq vhas_chr.16b, vdata.16b, 0 > + lsl shift, srcin, 2 > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ > + fmov synd, dend > + lsr synd, synd, shift > + cbz synd, L(start_loop) > +L(finish): > + rbit synd, synd > + clz synd, synd > + lsr result, synd, 2 > + cmp cntin, result > + csel result, cntin, result, ls > ret > > -def_fn strnlen > - cbz limit, .Lhit_limit > - mov zeroones, #REP8_01 > - bic src, srcin, #15 > - ands tmp1, srcin, #15 > - b.ne .Lmisaligned > - /* Calculate the number of full and partial words -1. */ > - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ > - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ > - > - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > - can be done in parallel across the entire word. */ > - /* The inner loop deals with two Dwords at a time. This has a > - slightly higher start-up cost, but we should win quite quickly, > - especially on cores with a high number of issue slots per > - cycle, as we get much better parallelism out of the operations. */ > - > - /* Start of critial section -- keep to one 64Byte cache line. 
*/ > -.Lloop: > - ldp data1, data2, [src], #16 > -.Lrealigned: > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, #REP8_7f > - bic has_nul1, tmp1, tmp2 > - bic has_nul2, tmp3, tmp4 > - subs limit_wd, limit_wd, #1 > - orr tmp1, has_nul1, has_nul2 > - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ > - b.eq .Lloop > - /* End of critical section -- keep to one 64Byte cache line. */ > - > - orr tmp1, has_nul1, has_nul2 > - cbz tmp1, .Lhit_limit /* No null in final Qword. */ > - > - /* We know there's a null in the final Qword. The easiest thing > - to do now is work out the length of the string and return > - MIN (len, limit). */ > - > - sub len, src, srcin > - cbz has_nul1, .Lnul_in_data2 > -#ifdef __AARCH64EB__ > - mov data2, data1 > -#endif > - sub len, len, #8 > - mov has_nul2, has_nul1 > -.Lnul_in_data2: > -#ifdef __AARCH64EB__ > - /* For big-endian, carry propagation (if the final byte in the > - string is 0x01) means we cannot use has_nul directly. The > - easiest way to get the correct byte is to byte-swap the data > - and calculate the syndrome a second time. */ > - rev data2, data2 > - sub tmp1, data2, zeroones > - orr tmp2, data2, #REP8_7f > - bic has_nul2, tmp1, tmp2 > -#endif > - sub len, len, #8 > - rev has_nul2, has_nul2 > - clz pos, has_nul2 > - add len, len, pos, lsr #3 /* Bits to bytes. */ > - cmp len, limit > - csel len, len, limit, ls /* Return the lower value. */ > +L(nomatch): > + mov result, cntin > ret > > -.Lmisaligned: > - /* Deal with a partial first word. > - We're doing two things in parallel here; > - 1) Calculate the number of words (but avoiding overflow if > - limit is near ULONG_MAX) - to do this we need to work out > - limit + tmp1 - 1 as a 65-bit value before shifting it; > - 2) Load and mask the initial data words - we force the bytes > - before the ones we are interested in to 0xff - this ensures > - early bytes will not hit any zero detection. 
*/ > - sub limit_wd, limit, #1 > - neg tmp4, tmp1 > - cmp tmp1, #8 > - > - and tmp3, limit_wd, #15 > - lsr limit_wd, limit_wd, #4 > - mov tmp2, #~0 > - > - ldp data1, data2, [src], #16 > - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ > - add tmp3, tmp3, tmp1 > - > -#ifdef __AARCH64EB__ > - /* Big-endian. Early bytes are at MSB. */ > - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ > -#else > - /* Little-endian. Early bytes are at LSB. */ > - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ > +L(start_loop): > + sub tmp, src, srcin > + add tmp, tmp, 17 > + subs cntrem, cntin, tmp > + b.lo L(nomatch) > + > + /* Make sure that it won't overread by a 16-byte chunk */ > + tbz cntrem, 4, L(loop32_2) > + sub src, src, 16 > + .p2align 5 > +L(loop32): > + ldr qdata, [src, 32]! > + cmeq vhas_chr.16b, vdata.16b, 0 > + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ > + fmov synd, dend > + cbnz synd, L(end) > +L(loop32_2): > + ldr qdata, [src, 16] > + subs cntrem, cntrem, 32 > + cmeq vhas_chr.16b, vdata.16b, 0 > + b.lo L(end_2) > + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ > + fmov synd, dend > + cbz synd, L(loop32) > +L(end_2): > + add src, src, 16 > +L(end): > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ > + sub result, src, srcin > + fmov synd, dend > +#ifndef __AARCH64EB__ > + rbit synd, synd > #endif > - add limit_wd, limit_wd, tmp3, lsr #4 > - > - orr data1, data1, tmp2 > - orr data2a, data2, tmp2 > - > - csinv data1, data1, xzr, le > - csel data2, data2, data2a, le > - b .Lrealigned > - .size strnlen, . - .Lstart /* Include pre-padding in size. 
*/ > + clz synd, synd > + add result, result, synd, lsr 2 > + cmp cntin, result > + csel result, cntin, result, ls > + ret > > +END (strnlen) > #endif > diff --git a/newlib/libc/machine/aarch64/strrchr.S b/newlib/libc/machine/aarch64/strrchr.S > index d64fc09b1a..b0574228b6 100644 > --- a/newlib/libc/machine/aarch64/strrchr.S > +++ b/newlib/libc/machine/aarch64/strrchr.S > @@ -1,32 +1,9 @@ > /* > - strrchr - find last instance of a character in a string > - > - Copyright (c) 2014, ARM Limited > - All rights Reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the company nor the names of its contributors > - may be used to endorse or promote products derived from this > - software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > + * strrchr - find last position of a character in a string. > + * > + * Copyright (c) 2014-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strchr-stub.c */ > #else > @@ -37,6 +14,8 @@ > * Neon Available. > */ > > +#include "asmdefs.h" > + > /* Arguments and results. */ > #define srcin x0 > #define chrin w1 > @@ -78,17 +57,8 @@ > in the original string a count_trailing_zeros() operation will > identify exactly which byte is causing the termination, and why. */ > > -/* Locals and temporaries. */ > - > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn strrchr > +ENTRY (strrchr) > + PTR_ARG (0) > /* Magic constant 0x40100401 to allow us to identify which lane > matches the requested byte. Magic constant 0x80200802 used > similarly for NUL termination. */ > @@ -100,7 +70,7 @@ def_fn strrchr > mov src_offset, #0 > ands tmp1, srcin, #31 > add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ > - b.eq .Laligned > + b.eq L(aligned) > > /* Input string is not 32-byte aligned. 
Rather than forcing > the padding bytes to a safe value, we calculate the syndrome > @@ -118,45 +88,45 @@ def_fn strrchr > and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b > addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 > addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 > - addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64 > - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 > - mov nul_match, vhas_nul1.2d[0] > + addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64 > + mov nul_match, vend1.d[0] > lsl tmp1, tmp1, #1 > mov const_m1, #~0 > - mov chr_match, vhas_chr1.2d[0] > lsr tmp3, const_m1, tmp1 > + mov chr_match, vend1.d[1] > > bic nul_match, nul_match, tmp3 // Mask padding bits. > bic chr_match, chr_match, tmp3 // Mask padding bits. > - cbnz nul_match, .Ltail > + cbnz nul_match, L(tail) > > -.Lloop: > + .p2align 4 > +L(loop): > cmp chr_match, #0 > csel src_match, src, src_match, ne > csel src_offset, chr_match, src_offset, ne > -.Laligned: > +L(aligned): > ld1 {vdata1.16b, vdata2.16b}, [src], #32 > - cmeq vhas_nul1.16b, vdata1.16b, #0 > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > - cmeq vhas_nul2.16b, vdata2.16b, #0 > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > - addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 > + uminp vend1.16b, vdata1.16b, vdata2.16b > and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b > and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b > + cmeq vend1.16b, vend1.16b, 0 > addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 > - addp vend1.16b, vend1.16b, vend1.16b // 128->64 > - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 > - mov nul_match, vend1.2d[0] > - mov chr_match, vhas_chr1.2d[0] > - cbz nul_match, .Lloop > + addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64 > + mov nul_match, vend1.d[0] > + mov chr_match, vend1.d[1] > + cbz nul_match, L(loop) > > + cmeq vhas_nul1.16b, vdata1.16b, #0 > + cmeq vhas_nul2.16b, vdata2.16b, #0 > and vhas_nul1.16b, 
vhas_nul1.16b, vrepmask_0.16b > and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b > addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b > addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b > - mov nul_match, vhas_nul1.2d[0] > + mov nul_match, vhas_nul1.d[0] > > -.Ltail: > +L(tail): > /* Work out exactly where the string ends. */ > sub tmp4, nul_match, #1 > eor tmp4, tmp4, nul_match > @@ -178,5 +148,5 @@ def_fn strrchr > > ret > > - .size strrchr, . - strrchr > +END (strrchr) > #endif ^ permalink raw reply [flat|nested] 7+ messages in thread
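The header comment quoted in the patch above explains that, because the syndrome bits reflect the order in which bytes occur in the string, a count-trailing-zeros operation identifies exactly which byte caused termination. The following plain-Python sketch models that scheme for illustration only — it uses two syndrome bits per byte of a 32-byte block (matching the 64-bit syndrome the 0x40100401/0x80200802 magic constants produce), assumes a NUL-terminated input and a nonzero search character, and none of the names below appear in the patch:

```python
def syndrome(block, c):
    """Model of the NEON compare+mask step: two syndrome bits per byte,
    so bit order in the syndrome mirrors byte order in the block."""
    s = 0
    for i, b in enumerate(block):
        if b == c:
            s |= 0x3 << (2 * i)
    return s

def ctz(x):
    # count trailing zeros of a nonzero integer
    return (x & -x).bit_length() - 1

def strrchr_model(data, c):
    """Index of the last occurrence of byte c before the first NUL, or
    None -- mirrors how the assembly combines chr/nul syndromes.
    Assumes data contains a NUL and c != 0."""
    nul = data.index(0)
    best = None
    for base in range(0, nul + 1, 32):
        block = data[base:base + 32]
        chr_s = syndrome(block, c)
        nul_s = syndrome(block, 0)
        if nul_s:
            # mask off matches at or beyond the terminator
            chr_s &= (1 << ctz(nul_s)) - 1
        if chr_s:
            # highest set bit pair -> last match in this block
            best = base + (chr_s.bit_length() - 1) // 2
        if nul_s:
            break
    return best
```

For example, `strrchr_model(b"hello\x00", ord("l"))` yields 3, the index the C `strrchr` would return a pointer to.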
* Re: [PATCH v3 1/2] aarch64: Sync with ARM-software/optimized-routines
  2023-10-05 10:37 ` Richard Earnshaw
@ 2023-10-05 12:23 ` Sebastian Huber
  0 siblings, 0 replies; 7+ messages in thread

From: Sebastian Huber @ 2023-10-05 12:23 UTC (permalink / raw)
To: Richard Earnshaw, newlib; +Cc: Szabolcs Nagy

Hello Richard,

On 05.10.23 12:37, Richard Earnshaw wrote:
> This is basically ok, but you're removing an existing license and adding
> a new one from Arm; I think you need to copy the new license into
> COPYING.NEWLIB - it's not enough just to have an SPDX identifier, the
> text of the license must be added somewhere as well.

Thanks for the review. I sent v4 of the patch set, which should fix the
license issue.

-- 
embedded brains GmbH
Herr Sebastian HUBER
Dornierstr. 4
82178 Puchheim
Germany
email: sebastian.huber@embedded-brains.de
phone: +49-89-18 94 741 - 16
fax: +49-89-18 94 741 - 08
Register court: Amtsgericht München
Registration number: HRB 157899
Managing directors authorized to represent the company: Peter Rasmussen, Thomas Dörfler
Our privacy policy is available here:
https://embedded-brains.de/datenschutzerklaerung/
* [PATCH v3 2/2] aarch64: Import memrchr.S 2023-09-12 10:05 [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines Sebastian Huber 2023-09-12 10:05 ` [PATCH v3 1/2] " Sebastian Huber @ 2023-09-12 10:05 ` Sebastian Huber 2023-09-18 12:25 ` [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines Sebastian Huber 2 siblings, 0 replies; 7+ messages in thread From: Sebastian Huber @ 2023-09-12 10:05 UTC (permalink / raw) To: newlib; +Cc: Szabolcs Nagy Import memrchr.S for AArch64 from: https://github.com/ARM-software/optimized-routines commit 0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560 Author: Sebastian Huber <sebastian.huber@embedded-brains.de> Date: Thu Jul 27 17:14:57 2023 +0200 string: Fix corrupt GNU_PROPERTY_TYPE (5) size For ELF32 the notes alignment is 4 and not 8. --- newlib/Makefile.in | 40 +++++++ newlib/libc/machine/aarch64/Makefile.inc | 2 + newlib/libc/machine/aarch64/memrchr-stub.c | 11 ++ newlib/libc/machine/aarch64/memrchr.S | 115 +++++++++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 newlib/libc/machine/aarch64/memrchr-stub.c create mode 100644 newlib/libc/machine/aarch64/memrchr.S diff --git a/newlib/Makefile.in b/newlib/Makefile.in index c3052acb93..028dd99f60 100644 --- a/newlib/Makefile.in +++ b/newlib/Makefile.in @@ -596,6 +596,8 @@ check_PROGRAMS = @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memcpy.S \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memmove-stub.c \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memmove.S \ +@HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memrchr-stub.c \ +@HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memrchr.S \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memset-stub.c \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/memset.S \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/rawmemchr.S \ @@ -1847,6 +1849,8 @@ am__objects_51 = libc/ssp/libc_a-chk_fail.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ 
libc/machine/aarch64/libc_a-memcpy.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memmove-stub.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memmove.$(OBJEXT) \ +@HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memrchr-stub.$(OBJEXT) \ +@HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memrchr.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memset-stub.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-memset.$(OBJEXT) \ @HAVE_LIBC_MACHINE_AARCH64_TRUE@ libc/machine/aarch64/libc_a-rawmemchr.$(OBJEXT) \ @@ -8024,6 +8028,12 @@ libc/machine/aarch64/libc_a-memmove-stub.$(OBJEXT): \ libc/machine/aarch64/libc_a-memmove.$(OBJEXT): \ libc/machine/aarch64/$(am__dirstamp) \ libc/machine/aarch64/$(DEPDIR)/$(am__dirstamp) +libc/machine/aarch64/libc_a-memrchr-stub.$(OBJEXT): \ + libc/machine/aarch64/$(am__dirstamp) \ + libc/machine/aarch64/$(DEPDIR)/$(am__dirstamp) +libc/machine/aarch64/libc_a-memrchr.$(OBJEXT): \ + libc/machine/aarch64/$(am__dirstamp) \ + libc/machine/aarch64/$(DEPDIR)/$(am__dirstamp) libc/machine/aarch64/libc_a-memset-stub.$(OBJEXT): \ libc/machine/aarch64/$(am__dirstamp) \ libc/machine/aarch64/$(DEPDIR)/$(am__dirstamp) @@ -12730,6 +12740,8 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memcpy.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memmove-stub.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memmove.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memset-stub.Po@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-memset.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@libc/machine/aarch64/$(DEPDIR)/libc_a-rawmemchr-stub.Po@am__quote@ @@ -16711,6 +16723,20 @@ libc/machine/aarch64/libc_a-memmove.obj: libc/machine/aarch64/memmove.S @AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/aarch64/libc_a-memmove.obj `if test -f 'libc/machine/aarch64/memmove.S'; then $(CYGPATH_W) 'libc/machine/aarch64/memmove.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memmove.S'; fi` +libc/machine/aarch64/libc_a-memrchr.o: libc/machine/aarch64/memrchr.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/aarch64/libc_a-memrchr.o -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Tpo -c -o libc/machine/aarch64/libc_a-memrchr.o `test -f 'libc/machine/aarch64/memrchr.S' || echo '$(srcdir)/'`libc/machine/aarch64/memrchr.S +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='libc/machine/aarch64/memrchr.S' object='libc/machine/aarch64/libc_a-memrchr.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/aarch64/libc_a-memrchr.o `test -f 'libc/machine/aarch64/memrchr.S' || echo '$(srcdir)/'`libc/machine/aarch64/memrchr.S + +libc/machine/aarch64/libc_a-memrchr.obj: libc/machine/aarch64/memrchr.S 
+@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/aarch64/libc_a-memrchr.obj -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Tpo -c -o libc/machine/aarch64/libc_a-memrchr.obj `if test -f 'libc/machine/aarch64/memrchr.S'; then $(CYGPATH_W) 'libc/machine/aarch64/memrchr.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memrchr.S'; fi` +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='libc/machine/aarch64/memrchr.S' object='libc/machine/aarch64/libc_a-memrchr.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/aarch64/libc_a-memrchr.obj `if test -f 'libc/machine/aarch64/memrchr.S'; then $(CYGPATH_W) 'libc/machine/aarch64/memrchr.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memrchr.S'; fi` + libc/machine/aarch64/libc_a-memset.o: libc/machine/aarch64/memset.S @am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/aarch64/libc_a-memset.o -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memset.Tpo -c -o libc/machine/aarch64/libc_a-memset.o `test -f 'libc/machine/aarch64/memset.S' || echo '$(srcdir)/'`libc/machine/aarch64/memset.S @am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memset.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memset.Po @@ -32981,6 +33007,20 @@ libc/machine/aarch64/libc_a-memmove-stub.obj: libc/machine/aarch64/memmove-stub. 
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -c -o libc/machine/aarch64/libc_a-memmove-stub.obj `if test -f 'libc/machine/aarch64/memmove-stub.c'; then $(CYGPATH_W) 'libc/machine/aarch64/memmove-stub.c'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memmove-stub.c'; fi` +libc/machine/aarch64/libc_a-memrchr-stub.o: libc/machine/aarch64/memrchr-stub.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -MT libc/machine/aarch64/libc_a-memrchr-stub.o -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Tpo -c -o libc/machine/aarch64/libc_a-memrchr-stub.o `test -f 'libc/machine/aarch64/memrchr-stub.c' || echo '$(srcdir)/'`libc/machine/aarch64/memrchr-stub.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='libc/machine/aarch64/memrchr-stub.c' object='libc/machine/aarch64/libc_a-memrchr-stub.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -c -o libc/machine/aarch64/libc_a-memrchr-stub.o `test -f 'libc/machine/aarch64/memrchr-stub.c' || echo '$(srcdir)/'`libc/machine/aarch64/memrchr-stub.c + +libc/machine/aarch64/libc_a-memrchr-stub.obj: libc/machine/aarch64/memrchr-stub.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -MT libc/machine/aarch64/libc_a-memrchr-stub.obj -MD -MP -MF 
libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Tpo -c -o libc/machine/aarch64/libc_a-memrchr-stub.obj `if test -f 'libc/machine/aarch64/memrchr-stub.c'; then $(CYGPATH_W) 'libc/machine/aarch64/memrchr-stub.c'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memrchr-stub.c'; fi` +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memrchr-stub.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='libc/machine/aarch64/memrchr-stub.c' object='libc/machine/aarch64/libc_a-memrchr-stub.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -c -o libc/machine/aarch64/libc_a-memrchr-stub.obj `if test -f 'libc/machine/aarch64/memrchr-stub.c'; then $(CYGPATH_W) 'libc/machine/aarch64/memrchr-stub.c'; else $(CYGPATH_W) '$(srcdir)/libc/machine/aarch64/memrchr-stub.c'; fi` + libc/machine/aarch64/libc_a-memset-stub.o: libc/machine/aarch64/memset-stub.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -MT libc/machine/aarch64/libc_a-memset-stub.o -MD -MP -MF libc/machine/aarch64/$(DEPDIR)/libc_a-memset-stub.Tpo -c -o libc/machine/aarch64/libc_a-memset-stub.o `test -f 'libc/machine/aarch64/memset-stub.c' || echo '$(srcdir)/'`libc/machine/aarch64/memset-stub.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) libc/machine/aarch64/$(DEPDIR)/libc_a-memset-stub.Tpo libc/machine/aarch64/$(DEPDIR)/libc_a-memset-stub.Po diff --git a/newlib/libc/machine/aarch64/Makefile.inc b/newlib/libc/machine/aarch64/Makefile.inc index 063a2a84ae..c749b0d575 100644 --- a/newlib/libc/machine/aarch64/Makefile.inc +++ b/newlib/libc/machine/aarch64/Makefile.inc @@ -7,6 +7,8 @@ libc_a_SOURCES += \ %D%/memcpy.S \ 
%D%/memmove-stub.c \ %D%/memmove.S \ + %D%/memrchr-stub.c \ + %D%/memrchr.S \ %D%/memset-stub.c \ %D%/memset.S \ %D%/rawmemchr.S \ diff --git a/newlib/libc/machine/aarch64/memrchr-stub.c b/newlib/libc/machine/aarch64/memrchr-stub.c new file mode 100644 index 0000000000..48f13bedc8 --- /dev/null +++ b/newlib/libc/machine/aarch64/memrchr-stub.c @@ -0,0 +1,11 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2023 embedded brains GmbH & Co. KG + */ + +#if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) +#include "../../string/memrchr.c" +#else +/* See memrchr.S */ +#endif diff --git a/newlib/libc/machine/aarch64/memrchr.S b/newlib/libc/machine/aarch64/memrchr.S new file mode 100644 index 0000000000..ba9915cc3d --- /dev/null +++ b/newlib/libc/machine/aarch64/memrchr.S @@ -0,0 +1,115 @@ +/* + * memrchr - find last character in a memory zone. + * + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. + */ + +#if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) +/* See memrchr-stub.c */ +#else +#include "asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define cntin x2 +#define result x0 + +#define src x3 +#define cntrem x4 +#define synd x5 +#define shift x6 +#define tmp x7 +#define end x8 +#define endm1 x9 + +#define vrepchr v0 +#define qdata q1 +#define vdata v1 +#define vhas_chr v2 +#define vend v3 +#define dend d3 + +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. 
*/ + +ENTRY (memrchr) + PTR_ARG (0) + add end, srcin, cntin + sub endm1, end, 1 + bic src, endm1, 15 + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src] + dup vrepchr.16b, chrin + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + neg shift, end, lsl 2 + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + fmov synd, dend + lsl synd, synd, shift + cbz synd, L(start_loop) + + clz synd, synd + sub result, endm1, synd, lsr 2 + cmp cntin, synd, lsr 2 + csel result, result, xzr, hi + ret + + nop +L(start_loop): + subs cntrem, src, srcin + b.ls L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + sub cntrem, cntrem, 1 + tbz cntrem, 4, L(loop32_2) + add src, src, 16 + + .p2align 5 +L(loop32): + ldr qdata, [src, -32]! + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) + +L(loop32_2): + ldr qdata, [src, -16] + subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + b.lo L(end_2) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) +L(end_2): + sub src, src, 16 +L(end): + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + fmov synd, dend + + add tmp, src, 15 +#ifdef __AARCH64EB__ + rbit synd, synd +#endif + clz synd, synd + sub tmp, tmp, synd, lsr 2 + cmp tmp, srcin + csel result, tmp, xzr, hs + ret + +L(nomatch): + mov result, 0 + ret + +END (memrchr) +#endif -- 2.35.3 ^ permalink raw reply [flat|nested] 7+ messages in thread
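The "core algorithm" comment in memrchr.S above can be modeled concretely. The sketch below is an illustrative plain-Python rendering, not a faithful translation of the assembly (it ignores alignment, MTE, and the big-endian `rbit` path, and all names are invented here): the `shrn` by 4 becomes four syndrome bits per comparison byte, and counting leading zeros of the resulting 64-bit syndrome locates the last match in a 16-byte chunk.

```python
def memrchr_model(data, c, n):
    """Index of the last occurrence of byte c in data[:n], or None.
    Scans backwards 16 bytes at a time with a nibble-mask syndrome,
    mimicking the structure of the memrchr.S loop."""
    if n == 0:
        return None
    for base in range(((n - 1) // 16) * 16, -1, -16):
        synd = 0
        for i, b in enumerate(data[base:base + 16]):
            if b == c and base + i < n:
                # shrn #4 keeps four bits per comparison byte, so the
                # syndrome bit order matches byte order in the chunk
                synd |= 0xF << (4 * i)
        if synd:
            clz = 64 - synd.bit_length()  # clz of the 64-bit syndrome
            return base + 15 - clz // 4   # byte index of the last match
    return None
```

For example, `memrchr_model(b"abcabc", ord("b"), 6)` yields 4, matching what C `memrchr` would point at.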
* Re: [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines
  2023-09-12 10:05 [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines Sebastian Huber
  2023-09-12 10:05 ` [PATCH v3 1/2] " Sebastian Huber
  2023-09-12 10:05 ` [PATCH v3 2/2] aarch64: Import memrchr.S Sebastian Huber
@ 2023-09-18 12:25 ` Sebastian Huber
  2023-09-27  9:53   ` Sebastian Huber
  2 siblings, 1 reply; 7+ messages in thread

From: Sebastian Huber @ 2023-09-18 12:25 UTC (permalink / raw)
To: newlib

On 12.09.23 12:05, Sebastian Huber wrote:
> This patch set synchronizes AArch64-specific files with the
> https://github.com/ARM-software/optimized-routines upstream.
>
> Sebastian Huber (2):
>   aarch64: Sync with ARM-software/optimized-routines
>   aarch64: Import memrchr.S
>
> v3: Use latest commit.

Would someone from ARM mind having a look at this patch set? This issue
has now been open for more than a year.

-- 
Sebastian Huber, embedded brains GmbH
* Re: [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines
  2023-09-18 12:25 ` [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines Sebastian Huber
@ 2023-09-27  9:53 ` Sebastian Huber
  0 siblings, 0 replies; 7+ messages in thread

From: Sebastian Huber @ 2023-09-27 9:53 UTC (permalink / raw)
To: newlib

On 18.09.23 14:25, Sebastian Huber wrote:
> On 12.09.23 12:05, Sebastian Huber wrote:
>> This patch set synchronizes AArch64-specific files with the
>> https://github.com/ARM-software/optimized-routines upstream.
>>
>> Sebastian Huber (2):
>>   aarch64: Sync with ARM-software/optimized-routines
>>   aarch64: Import memrchr.S
>>
>> v3: Use latest commit.
>
> Would someone from ARM mind having a look at this patch set? This issue
> is now open for more than a year.

Since ARM seems to be unwilling to maintain this part of Newlib or even
to review patches related to this area, we need a way forward.

-- 
Sebastian Huber, embedded brains GmbH
End of thread, other threads: [~2023-10-05 12:23 UTC | newest]

Thread overview: 7+ messages
  2023-09-12 10:05 [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines Sebastian Huber
  2023-09-12 10:05 ` [PATCH v3 1/2] " Sebastian Huber
  2023-10-05 10:37   ` Richard Earnshaw
  2023-10-05 12:23     ` Sebastian Huber
  2023-09-12 10:05 ` [PATCH v3 2/2] aarch64: Import memrchr.S Sebastian Huber
  2023-09-18 12:25 ` [PATCH 0/2] aarch64: Sync with ARM-software/optimized-routines Sebastian Huber
  2023-09-27  9:53   ` Sebastian Huber