Date: Thu, 5 Oct 2023 11:37:06 +0100
Subject: Re: [PATCH v3 1/2] aarch64: Sync with ARM-software/optimized-routines
To: Sebastian Huber, newlib@sourceware.org
Cc: Szabolcs Nagy
References: <20230912100507.33946-1-sebastian.huber@embedded-brains.de> <20230912100507.33946-2-sebastian.huber@embedded-brains.de>
From: Richard Earnshaw
In-Reply-To: <20230912100507.33946-2-sebastian.huber@embedded-brains.de>

Hi Sebastian,

My apologies for the delay in replying; organizing the GNU Cauldron took up a lot of my time over the last few weeks.

This is basically OK, but you're removing an existing license and adding a new one from Arm. I think you need to copy the new license into COPYING.NEWLIB - it's not enough just to have an SPDX identifier; the text of the license must be added somewhere as well.

R.

On 12/09/2023 11:05, Sebastian Huber wrote:
> Update AArch64 assembly string routines from:
> 
> https://github.com/ARM-software/optimized-routines
> 
> commit 0cf84f26b6b8dcad8287fe30a4dcc1fdabd06560
> Author: Sebastian Huber
> Date:   Thu Jul 27 17:14:57 2023 +0200
> 
>     string: Fix corrupt GNU_PROPERTY_TYPE (5) size
> 
>     For ELF32 the notes alignment is 4 and not 8.
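
For reference, the NT_GNU_PROPERTY_TYPE_0 note that the new GNU_PROPERTY macro emits boils down to the layout below. This is a minimal C sketch of the ILP32 case only (the struct and field names are illustrative, not part of the patch), with one field per .word directive in asmdefs.h further down; the ELF32/ELF64 difference is exactly the descsz and the section alignment that the commit message above refers to.

/* Illustrative sketch, not part of the patch: the ILP32 form of the
   NT_GNU_PROPERTY_TYPE_0 note, one field per .word in GNU_PROPERTY().  */
#include <stdint.h>

struct aarch64_gnu_property_note_ilp32 {
    uint32_t n_namesz;    /* 4: "GNU" including the terminating NUL */
    uint32_t n_descsz;    /* 12 here; the ELF64 variant uses 16 (pad word) */
    uint32_t n_type;      /* 5: NT_GNU_PROPERTY_TYPE_0 */
    char     n_name[4];   /* "GNU" */
    uint32_t pr_type;     /* 0xc0000000: FEATURE_1_AND */
    uint32_t pr_datasz;   /* 4 */
    uint32_t pr_data;     /* FEATURE_1_BTI | FEATURE_1_PAC */
};
/* The containing section is .p2align 2 (4-byte aligned) for ELF32, while
   ELF64 uses .p2align 3 and a trailing pad word; the macro below selects
   the 4-byte form under __ILP32__, which is the fix described above.  */
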
> --- > newlib/libc/machine/aarch64/asmdefs.h | 106 ++++++ > newlib/libc/machine/aarch64/memchr.S | 73 ++-- > newlib/libc/machine/aarch64/memcmp.S | 311 +++++++++-------- > newlib/libc/machine/aarch64/memcpy.S | 272 ++++++++------- > newlib/libc/machine/aarch64/memset.S | 194 ++--------- > newlib/libc/machine/aarch64/stpcpy.S | 36 +- > newlib/libc/machine/aarch64/strchr.S | 107 ++---- > newlib/libc/machine/aarch64/strchrnul.S | 90 ++--- > newlib/libc/machine/aarch64/strcmp.S | 282 ++++++++------- > newlib/libc/machine/aarch64/strcpy.S | 437 +++++++----------------- > newlib/libc/machine/aarch64/strlen.S | 319 ++++++++--------- > newlib/libc/machine/aarch64/strncmp.S | 323 ++++++++++-------- > newlib/libc/machine/aarch64/strnlen.S | 256 +++++--------- > newlib/libc/machine/aarch64/strrchr.S | 86 ++--- > 14 files changed, 1226 insertions(+), 1666 deletions(-) > create mode 100644 newlib/libc/machine/aarch64/asmdefs.h > > diff --git a/newlib/libc/machine/aarch64/asmdefs.h b/newlib/libc/machine/aarch64/asmdefs.h > new file mode 100644 > index 0000000000..131b95e1fe > --- /dev/null > +++ b/newlib/libc/machine/aarch64/asmdefs.h > @@ -0,0 +1,106 @@ > +/* > + * Macros for asm code. AArch64 version. > + * > + * Copyright (c) 2019-2023, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > + > +#ifndef _ASMDEFS_H > +#define _ASMDEFS_H > + > +/* Branch Target Identitication support. */ > +#define BTI_C hint 34 > +#define BTI_J hint 36 > +/* Return address signing support (pac-ret). */ > +#define PACIASP hint 25; .cfi_window_save > +#define AUTIASP hint 29; .cfi_window_save > + > +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ > +#define FEATURE_1_AND 0xc0000000 > +#define FEATURE_1_BTI 1 > +#define FEATURE_1_PAC 2 > + > +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ > +#ifdef __ILP32__ > +#define GNU_PROPERTY(type, value) \ > + .section .note.gnu.property, "a"; \ > + .p2align 2; \ > + .word 4; \ > + .word 12; \ > + .word 5; \ > + .asciz "GNU"; \ > + .word type; \ > + .word 4; \ > + .word value; \ > + .text > +#else > +#define GNU_PROPERTY(type, value) \ > + .section .note.gnu.property, "a"; \ > + .p2align 3; \ > + .word 4; \ > + .word 16; \ > + .word 5; \ > + .asciz "GNU"; \ > + .word type; \ > + .word 4; \ > + .word value; \ > + .word 0; \ > + .text > +#endif > + > +/* If set then the GNU Property Note section will be added to > + mark objects to support BTI and PAC-RET. */ > +#ifndef WANT_GNU_PROPERTY > +#define WANT_GNU_PROPERTY 1 > +#endif > + > +#if WANT_GNU_PROPERTY > +/* Add property note with supported features to all asm files. 
*/ > +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) > +#endif > + > +#define ENTRY_ALIGN(name, alignment) \ > + .global name; \ > + .type name,%function; \ > + .align alignment; \ > + name: \ > + .cfi_startproc; \ > + BTI_C; > + > +#define ENTRY(name) ENTRY_ALIGN(name, 6) > + > +#define ENTRY_ALIAS(name) \ > + .global name; \ > + .type name,%function; \ > + name: > + > +#define END(name) \ > + .cfi_endproc; \ > + .size name, .-name; > + > +#define L(l) .L ## l > + > +#ifdef __ILP32__ > + /* Sanitize padding bits of pointer arguments as per aapcs64 */ > +#define PTR_ARG(n) mov w##n, w##n > +#else > +#define PTR_ARG(n) > +#endif > + > +#ifdef __ILP32__ > + /* Sanitize padding bits of size arguments as per aapcs64 */ > +#define SIZE_ARG(n) mov w##n, w##n > +#else > +#define SIZE_ARG(n) > +#endif > + > +/* Compiler supports SVE instructions */ > +#ifndef HAVE_SVE > +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) > +# define HAVE_SVE 1 > +# else > +# define HAVE_SVE 0 > +# endif > +#endif > + > +#endif > diff --git a/newlib/libc/machine/aarch64/memchr.S b/newlib/libc/machine/aarch64/memchr.S > index 53f5d6bc0e..a0f305e0fc 100644 > --- a/newlib/libc/machine/aarch64/memchr.S > +++ b/newlib/libc/machine/aarch64/memchr.S > @@ -1,31 +1,8 @@ > /* > * memchr - find a character in a memory zone > * > - * Copyright (c) 2014, ARM Limited > - * All rights Reserved. > - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions are met: > - * * Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. > - * * Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in the > - * documentation and/or other materials provided with the distribution. > - * * Neither the name of the company nor the names of its contributors > - * may be used to endorse or promote products derived from this > - * software without specific prior written permission. > - * > - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + * Copyright (c) 2014-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > */ > > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > @@ -37,6 +14,8 @@ > * Neon Available. > */ > > +#include "asmdefs.h" > + > /* Arguments and results. */ > #define srcin x0 > #define chrin w1 > @@ -70,17 +49,11 @@ > * identify exactly which byte has matched. 
> */ > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn memchr > +ENTRY (memchr) > + PTR_ARG (0) > + SIZE_ARG (2) > /* Do not dereference srcin if no bytes to compare. */ > - cbz cntin, .Lzero_length > + cbz cntin, L(zero_length) > /* > * Magic constant 0x40100401 allows us to identify which lane matches > * the requested byte. > @@ -93,7 +66,7 @@ def_fn memchr > dup vrepmask.4s, wtmp2 > ands soff, srcin, #31 > and cntrem, cntin, #31 > - b.eq .Lloop > + b.eq L(loop) > > /* > * Input string is not 32-byte aligned. We calculate the syndrome > @@ -110,41 +83,41 @@ def_fn memchr > and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b > addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ > addp vend.16b, vend.16b, vend.16b /* 128->64 */ > - mov synd, vend.2d[0] > + mov synd, vend.d[0] > /* Clear the soff*2 lower bits */ > lsl tmp, soff, #1 > lsr synd, synd, tmp > lsl synd, synd, tmp > /* The first block can also be the last */ > - b.ls .Lmasklast > + b.ls L(masklast) > /* Have we found something already? */ > - cbnz synd, .Ltail > + cbnz synd, L(tail) > > -.Lloop: > +L(loop): > ld1 {vdata1.16b, vdata2.16b}, [src], #32 > subs cntin, cntin, #32 > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > /* If we're out of data we finish regardless of the result */ > - b.ls .Lend > + b.ls L(end) > /* Use a fast check for the termination condition */ > orr vend.16b, vhas_chr1.16b, vhas_chr2.16b > addp vend.2d, vend.2d, vend.2d > - mov synd, vend.2d[0] > + mov synd, vend.d[0] > /* We're not out of data, loop if we haven't found the character */ > - cbz synd, .Lloop > + cbz synd, L(loop) > > -.Lend: > +L(end): > /* Termination condition found, let's calculate the syndrome value */ > and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b > and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b > addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ > addp vend.16b, vend.16b, vend.16b /* 128->64 */ > - mov synd, vend.2d[0] > + mov synd, vend.d[0] > /* Only do the clear for the last possible block */ > - b.hi .Ltail > + b.hs L(tail) > > -.Lmasklast: > +L(masklast): > /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ > add tmp, cntrem, soff > and tmp, tmp, #31 > @@ -153,7 +126,7 @@ def_fn memchr > lsl synd, synd, tmp > lsr synd, synd, tmp > > -.Ltail: > +L(tail): > /* Count the trailing zeros using bit reversing */ > rbit synd, synd > /* Compensate the last post-increment */ > @@ -168,9 +141,9 @@ def_fn memchr > csel result, xzr, result, eq > ret > > -.Lzero_length: > +L(zero_length): > mov result, #0 > ret > > - .size memchr, . - memchr > +END (memchr) > #endif > diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S > index 605d99365e..18874d3215 100644 > --- a/newlib/libc/machine/aarch64/memcmp.S > +++ b/newlib/libc/machine/aarch64/memcmp.S > @@ -1,57 +1,7 @@ > /* memcmp - compare memory > - > - Copyright (c) 2018 Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. 
> - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > -/* > - * Copyright (c) 2017 ARM Ltd > - * All rights reserved. > - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions > - * are met: > - * 1. Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. > - * 2. Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in the > - * documentation and/or other materials provided with the distribution. > - * 3. The name of the company may not be used to endorse or promote > - * products derived from this software without specific prior written > - * permission. > * > - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED > - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF > - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. > - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED > - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING > - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + * Copyright (c) 2013-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > */ > > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > @@ -60,103 +10,79 @@ > > /* Assumptions: > * > - * ARMv8-a, AArch64, unaligned accesses. > + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. > */ > > -#define L(l) .L ## l > - > -/* Parameters and result. */ > -#define src1 x0 > -#define src2 x1 > -#define limit x2 > -#define result w0 > - > -/* Internal variables. 
*/ > -#define data1 x3 > -#define data1w w3 > -#define data1h x4 > -#define data2 x5 > -#define data2w w5 > -#define data2h x6 > -#define tmp1 x7 > -#define tmp2 x8 > - > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn memcmp p2align=6 > - subs limit, limit, 8 > - b.lo L(less8) > - > - ldr data1, [src1], 8 > - ldr data2, [src2], 8 > - cmp data1, data2 > - b.ne L(return) > - > - subs limit, limit, 8 > - b.gt L(more16) > - > - ldr data1, [src1, limit] > - ldr data2, [src2, limit] > - b L(return) > - > -L(more16): > - ldr data1, [src1], 8 > - ldr data2, [src2], 8 > - cmp data1, data2 > - bne L(return) > - > - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) > - strings. */ > - subs limit, limit, 16 > +#include "asmdefs.h" > + > +#define src1 x0 > +#define src2 x1 > +#define limit x2 > +#define result w0 > + > +#define data1 x3 > +#define data1w w3 > +#define data2 x4 > +#define data2w w4 > +#define data3 x5 > +#define data3w w5 > +#define data4 x6 > +#define data4w w6 > +#define tmp x6 > +#define src1end x7 > +#define src2end x8 > + > + > +ENTRY (memcmp) > + PTR_ARG (0) > + PTR_ARG (1) > + SIZE_ARG (2) > + > + cmp limit, 16 > + b.lo L(less16) > + ldp data1, data3, [src1] > + ldp data2, data4, [src2] > + ccmp data1, data2, 0, ne > + ccmp data3, data4, 0, eq > + b.ne L(return2) > + > + add src1end, src1, limit > + add src2end, src2, limit > + cmp limit, 32 > b.ls L(last_bytes) > + cmp limit, 160 > + b.hs L(loop_align) > + sub limit, limit, 32 > > - /* We overlap loads between 0-32 bytes at either side of SRC1 when we > - try to align, so limit it only to strings larger than 128 bytes. */ > - cmp limit, 96 > - b.ls L(loop16) > - > - /* Align src1 and adjust src2 with bytes not yet done. */ > - and tmp1, src1, 15 > - add limit, limit, tmp1 > - sub src1, src1, tmp1 > - sub src2, src2, tmp1 > - > - /* Loop performing 16 bytes per iteration using aligned src1. > - Limit is pre-decremented by 16 and must be larger than zero. > - Exit if <= 16 bytes left to do or if the data is not equal. */ > .p2align 4 > -L(loop16): > - ldp data1, data1h, [src1], 16 > - ldp data2, data2h, [src2], 16 > - subs limit, limit, 16 > - ccmp data1, data2, 0, hi > - ccmp data1h, data2h, 0, eq > - b.eq L(loop16) > - > +L(loop32): > + ldp data1, data3, [src1, 16] > + ldp data2, data4, [src2, 16] > cmp data1, data2 > - bne L(return) > - mov data1, data1h > - mov data2, data2h > + ccmp data3, data4, 0, eq > + b.ne L(return2) > + cmp limit, 16 > + b.ls L(last_bytes) > + > + ldp data1, data3, [src1, 32] > + ldp data2, data4, [src2, 32] > cmp data1, data2 > - bne L(return) > + ccmp data3, data4, 0, eq > + b.ne L(return2) > + add src1, src1, 32 > + add src2, src2, 32 > +L(last64): > + subs limit, limit, 32 > + b.hi L(loop32) > > /* Compare last 1-16 bytes using unaligned access. */ > L(last_bytes): > - add src1, src1, limit > - add src2, src2, limit > - ldp data1, data1h, [src1] > - ldp data2, data2h, [src2] > - cmp data1, data2 > - bne L(return) > - mov data1, data1h > - mov data2, data2h > + ldp data1, data3, [src1end, -16] > + ldp data2, data4, [src2end, -16] > +L(return2): > cmp data1, data2 > + csel data1, data1, data3, ne > + csel data2, data2, data4, ne > > /* Compare data bytes and set return value to 0, -1 or 1. 
*/ > L(return): > @@ -164,33 +90,106 @@ L(return): > rev data1, data1 > rev data2, data2 > #endif > - cmp data1, data2 > -L(ret_eq): > + cmp data1, data2 > cset result, ne > cneg result, result, lo > ret > > .p2align 4 > - /* Compare up to 8 bytes. Limit is [-8..-1]. */ > +L(less16): > + add src1end, src1, limit > + add src2end, src2, limit > + tbz limit, 3, L(less8) > + ldr data1, [src1] > + ldr data2, [src2] > + ldr data3, [src1end, -8] > + ldr data4, [src2end, -8] > + b L(return2) > + > + .p2align 4 > L(less8): > - adds limit, limit, 4 > - b.lo L(less4) > - ldr data1w, [src1], 4 > - ldr data2w, [src2], 4 > + tbz limit, 2, L(less4) > + ldr data1w, [src1] > + ldr data2w, [src2] > + ldr data3w, [src1end, -4] > + ldr data4w, [src2end, -4] > + b L(return2) > + > +L(less4): > + tbz limit, 1, L(less2) > + ldrh data1w, [src1] > + ldrh data2w, [src2] > cmp data1w, data2w > b.ne L(return) > - sub limit, limit, 4 > -L(less4): > - adds limit, limit, 4 > - beq L(ret_eq) > -L(byte_loop): > - ldrb data1w, [src1], 1 > - ldrb data2w, [src2], 1 > - subs limit, limit, 1 > - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ > - b.eq L(byte_loop) > +L(less2): > + mov result, 0 > + tbz limit, 0, L(return_zero) > + ldrb data1w, [src1end, -1] > + ldrb data2w, [src2end, -1] > sub result, data1w, data2w > +L(return_zero): > + ret > + > +L(loop_align): > + ldp data1, data3, [src1, 16] > + ldp data2, data4, [src2, 16] > + cmp data1, data2 > + ccmp data3, data4, 0, eq > + b.ne L(return2) > + > + /* Align src2 and adjust src1, src2 and limit. */ > + and tmp, src2, 15 > + sub tmp, tmp, 16 > + sub src2, src2, tmp > + add limit, limit, tmp > + sub src1, src1, tmp > + sub limit, limit, 64 + 16 > + > + .p2align 4 > +L(loop64): > + ldr q0, [src1, 16] > + ldr q1, [src2, 16] > + subs limit, limit, 64 > + ldr q2, [src1, 32] > + ldr q3, [src2, 32] > + eor v0.16b, v0.16b, v1.16b > + eor v1.16b, v2.16b, v3.16b > + ldr q2, [src1, 48] > + ldr q3, [src2, 48] > + umaxp v0.16b, v0.16b, v1.16b > + ldr q4, [src1, 64]! > + ldr q5, [src2, 64]! > + eor v1.16b, v2.16b, v3.16b > + eor v2.16b, v4.16b, v5.16b > + umaxp v1.16b, v1.16b, v2.16b > + umaxp v0.16b, v0.16b, v1.16b > + umaxp v0.16b, v0.16b, v0.16b > + fmov tmp, d0 > + ccmp tmp, 0, 0, hi > + b.eq L(loop64) > + > + /* If equal, process last 1-64 bytes using scalar loop. */ > + add limit, limit, 64 + 16 > + cbz tmp, L(last64) > + > + /* Determine the 8-byte aligned offset of the first difference. */ > +#ifdef __AARCH64EB__ > + rev16 tmp, tmp > +#endif > + rev tmp, tmp > + clz tmp, tmp > + bic tmp, tmp, 7 > + sub tmp, tmp, 48 > + ldr data1, [src1, tmp] > + ldr data2, [src2, tmp] > +#ifndef __AARCH64EB__ > + rev data1, data1 > + rev data2, data2 > +#endif > + mov result, 1 > + cmp data1, data2 > + cneg result, result, lo > ret > > - .size memcmp, . - memcmp > +END (memcmp) > #endif > diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S > index 463bad0a18..248e7843a2 100644 > --- a/newlib/libc/machine/aarch64/memcpy.S > +++ b/newlib/libc/machine/aarch64/memcpy.S > @@ -1,55 +1,8 @@ > -/* Copyright (c) 2012-2013, Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. 
> - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > /* > - * Copyright (c) 2015 ARM Ltd > - * All rights reserved. > - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions > - * are met: > - * 1. Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. > - * 2. Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in the > - * documentation and/or other materials provided with the distribution. > - * 3. The name of the company may not be used to endorse or promote > - * products derived from this software without specific prior written > - * permission. > + * memcpy - copy memory area > * > - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED > - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF > - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. > - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED > - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING > - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + * Copyright (c) 2012-2022, Arm Limited. 
> + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > */ > > /* Assumptions: > @@ -61,6 +14,7 @@ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See memcpy-stub.c */ > #else > +#include "asmdefs.h" > > #define dstin x0 > #define src x1 > @@ -71,122 +25,139 @@ > #define A_l x6 > #define A_lw w6 > #define A_h x7 > -#define A_hw w7 > #define B_l x8 > #define B_lw w8 > #define B_h x9 > #define C_l x10 > +#define C_lw w10 > #define C_h x11 > #define D_l x12 > #define D_h x13 > -#define E_l src > -#define E_h count > -#define F_l srcend > -#define F_h dst > -#define tmp1 x9 > - > -#define L(l) .L ## l > - > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -/* Copies are split into 3 main cases: small copies of up to 16 bytes, > - medium copies of 17..96 bytes which are fully unrolled. Large copies > - of more than 96 bytes align the destination and use an unrolled loop > - processing 64 bytes per iteration. > - Small and medium copies read all data before writing, allowing any > - kind of overlap, and memmove tailcalls memcpy for these cases as > - well as non-overlapping copies. > +#define E_l x14 > +#define E_h x15 > +#define F_l x16 > +#define F_h x17 > +#define G_l count > +#define G_h dst > +#define H_l src > +#define H_h srcend > +#define tmp1 x14 > + > +/* This implementation handles overlaps and supports both memcpy and memmove > + from a single entry point. It uses unaligned accesses and branchless > + sequences to keep the code small, simple and improve performance. > + > + Copies are split into 3 main cases: small copies of up to 32 bytes, medium > + copies of up to 128 bytes, and large copies. The overhead of the overlap > + check is negligible since it is only required for large copies. > + > + Large copies use a software pipelined loop processing 64 bytes per iteration. > + The destination pointer is 16-byte aligned to minimize unaligned accesses. > + The loop tail is handled by always copying 64 bytes from the end. > */ > > -def_fn memcpy p2align=6 > - prfm PLDL1KEEP, [src] > +ENTRY_ALIAS (memmove) > +ENTRY (memcpy) > + PTR_ARG (0) > + PTR_ARG (1) > + SIZE_ARG (2) > add srcend, src, count > add dstend, dstin, count > - cmp count, 16 > - b.ls L(copy16) > - cmp count, 96 > + cmp count, 128 > b.hi L(copy_long) > + cmp count, 32 > + b.hi L(copy32_128) > > - /* Medium copies: 17..96 bytes. */ > - sub tmp1, count, 1 > + /* Small copies: 0..32 bytes. */ > + cmp count, 16 > + b.lo L(copy16) > ldp A_l, A_h, [src] > - tbnz tmp1, 6, L(copy96) > ldp D_l, D_h, [srcend, -16] > - tbz tmp1, 5, 1f > - ldp B_l, B_h, [src, 16] > - ldp C_l, C_h, [srcend, -32] > - stp B_l, B_h, [dstin, 16] > - stp C_l, C_h, [dstend, -32] > -1: > stp A_l, A_h, [dstin] > stp D_l, D_h, [dstend, -16] > ret > > - .p2align 4 > - /* Small copies: 0..16 bytes. */ > + /* Copy 8-15 bytes. */ > L(copy16): > - cmp count, 8 > - b.lo 1f > + tbz count, 3, L(copy8) > ldr A_l, [src] > ldr A_h, [srcend, -8] > str A_l, [dstin] > str A_h, [dstend, -8] > ret > - .p2align 4 > -1: > - tbz count, 2, 1f > + > + .p2align 3 > + /* Copy 4-7 bytes. */ > +L(copy8): > + tbz count, 2, L(copy4) > ldr A_lw, [src] > - ldr A_hw, [srcend, -4] > + ldr B_lw, [srcend, -4] > str A_lw, [dstin] > - str A_hw, [dstend, -4] > + str B_lw, [dstend, -4] > ret > > - /* Copy 0..3 bytes. Use a branchless sequence that copies the same > - byte 3 times if count==1, or the 2nd byte twice if count==2. 
*/ > -1: > - cbz count, 2f > + /* Copy 0..3 bytes using a branchless sequence. */ > +L(copy4): > + cbz count, L(copy0) > lsr tmp1, count, 1 > ldrb A_lw, [src] > - ldrb A_hw, [srcend, -1] > + ldrb C_lw, [srcend, -1] > ldrb B_lw, [src, tmp1] > strb A_lw, [dstin] > strb B_lw, [dstin, tmp1] > - strb A_hw, [dstend, -1] > -2: ret > + strb C_lw, [dstend, -1] > +L(copy0): > + ret > > .p2align 4 > - /* Copy 64..96 bytes. Copy 64 bytes from the start and > - 32 bytes from the end. */ > -L(copy96): > + /* Medium copies: 33..128 bytes. */ > +L(copy32_128): > + ldp A_l, A_h, [src] > ldp B_l, B_h, [src, 16] > - ldp C_l, C_h, [src, 32] > - ldp D_l, D_h, [src, 48] > - ldp E_l, E_h, [srcend, -32] > - ldp F_l, F_h, [srcend, -16] > + ldp C_l, C_h, [srcend, -32] > + ldp D_l, D_h, [srcend, -16] > + cmp count, 64 > + b.hi L(copy128) > stp A_l, A_h, [dstin] > stp B_l, B_h, [dstin, 16] > - stp C_l, C_h, [dstin, 32] > - stp D_l, D_h, [dstin, 48] > - stp E_l, E_h, [dstend, -32] > - stp F_l, F_h, [dstend, -16] > + stp C_l, C_h, [dstend, -32] > + stp D_l, D_h, [dstend, -16] > ret > > - /* Align DST to 16 byte alignment so that we don't cross cache line > - boundaries on both loads and stores. There are at least 96 bytes > - to copy, so copy 16 bytes unaligned and then align. The loop > - copies 64 bytes per iteration and prefetches one iteration ahead. */ > + .p2align 4 > + /* Copy 65..128 bytes. */ > +L(copy128): > + ldp E_l, E_h, [src, 32] > + ldp F_l, F_h, [src, 48] > + cmp count, 96 > + b.ls L(copy96) > + ldp G_l, G_h, [srcend, -64] > + ldp H_l, H_h, [srcend, -48] > + stp G_l, G_h, [dstend, -64] > + stp H_l, H_h, [dstend, -48] > +L(copy96): > + stp A_l, A_h, [dstin] > + stp B_l, B_h, [dstin, 16] > + stp E_l, E_h, [dstin, 32] > + stp F_l, F_h, [dstin, 48] > + stp C_l, C_h, [dstend, -32] > + stp D_l, D_h, [dstend, -16] > + ret > > .p2align 4 > + /* Copy more than 128 bytes. */ > L(copy_long): > + /* Use backwards copy if there is an overlap. */ > + sub tmp1, dstin, src > + cbz tmp1, L(copy0) > + cmp tmp1, count > + b.lo L(copy_long_backwards) > + > + /* Copy 16 bytes and then align dst to 16-byte alignment. */ > + > + ldp D_l, D_h, [src] > and tmp1, dstin, 15 > bic dst, dstin, 15 > - ldp D_l, D_h, [src] > sub src, src, tmp1 > add count, count, tmp1 /* Count is now 16 too large. */ > ldp A_l, A_h, [src, 16] > @@ -195,8 +166,9 @@ L(copy_long): > ldp C_l, C_h, [src, 48] > ldp D_l, D_h, [src, 64]! > subs count, count, 128 + 16 /* Test and readjust count. */ > - b.ls 2f > -1: > + b.ls L(copy64_from_end) > + > +L(loop64): > stp A_l, A_h, [dst, 16] > ldp A_l, A_h, [src, 16] > stp B_l, B_h, [dst, 32] > @@ -206,12 +178,10 @@ L(copy_long): > stp D_l, D_h, [dst, 64]! > ldp D_l, D_h, [src, 64]! > subs count, count, 64 > - b.hi 1b > + b.hi L(loop64) > > - /* Write the last full set of 64 bytes. The remainder is at most 64 > - bytes, so it is safe to always copy 64 bytes from the end even if > - there is just 1 byte left. */ > -2: > + /* Write the last iteration and copy 64 bytes from the end. */ > +L(copy64_from_end): > ldp E_l, E_h, [srcend, -64] > stp A_l, A_h, [dst, 16] > ldp A_l, A_h, [srcend, -48] > @@ -226,5 +196,51 @@ L(copy_long): > stp C_l, C_h, [dstend, -16] > ret > > - .size memcpy, . - memcpy > + .p2align 4 > + > + /* Large backwards copy for overlapping copies. > + Copy 16 bytes and then align dst to 16-byte alignment. 
*/ > +L(copy_long_backwards): > + ldp D_l, D_h, [srcend, -16] > + and tmp1, dstend, 15 > + sub srcend, srcend, tmp1 > + sub count, count, tmp1 > + ldp A_l, A_h, [srcend, -16] > + stp D_l, D_h, [dstend, -16] > + ldp B_l, B_h, [srcend, -32] > + ldp C_l, C_h, [srcend, -48] > + ldp D_l, D_h, [srcend, -64]! > + sub dstend, dstend, tmp1 > + subs count, count, 128 > + b.ls L(copy64_from_start) > + > +L(loop64_backwards): > + stp A_l, A_h, [dstend, -16] > + ldp A_l, A_h, [srcend, -16] > + stp B_l, B_h, [dstend, -32] > + ldp B_l, B_h, [srcend, -32] > + stp C_l, C_h, [dstend, -48] > + ldp C_l, C_h, [srcend, -48] > + stp D_l, D_h, [dstend, -64]! > + ldp D_l, D_h, [srcend, -64]! > + subs count, count, 64 > + b.hi L(loop64_backwards) > + > + /* Write the last iteration and copy 64 bytes from the start. */ > +L(copy64_from_start): > + ldp G_l, G_h, [src, 48] > + stp A_l, A_h, [dstend, -16] > + ldp A_l, A_h, [src, 32] > + stp B_l, B_h, [dstend, -32] > + ldp B_l, B_h, [src, 16] > + stp C_l, C_h, [dstend, -48] > + ldp C_l, C_h, [src] > + stp D_l, D_h, [dstend, -64] > + stp G_l, G_h, [dstin, 48] > + stp A_l, A_h, [dstin, 32] > + stp B_l, B_h, [dstin, 16] > + stp C_l, C_h, [dstin] > + ret > + > +END (memcpy) > #endif > diff --git a/newlib/libc/machine/aarch64/memset.S b/newlib/libc/machine/aarch64/memset.S > index 103e3f8bb0..ca76439a91 100644 > --- a/newlib/libc/machine/aarch64/memset.S > +++ b/newlib/libc/machine/aarch64/memset.S > @@ -1,66 +1,20 @@ > -/* Copyright (c) 2012-2013, Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > /* > - * Copyright (c) 2015 ARM Ltd > - * All rights reserved. > + * memset - fill memory with a constant byte > * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions > - * are met: > - * 1. Redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer. > - * 2. 
Redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in the > - * documentation and/or other materials provided with the distribution. > - * 3. The name of the company may not be used to endorse or promote > - * products derived from this software without specific prior written > - * permission. > - * > - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED > - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF > - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. > - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED > - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR > - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING > - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS > - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + * Copyright (c) 2012-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > */ > > /* Assumptions: > * > - * ARMv8-a, AArch64, unaligned accesses > + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. > * > */ > > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See memset-stub.c */ > #else > +#include "asmdefs.h" > > #define dstin x0 > #define val x1 > @@ -68,24 +22,11 @@ > #define count x2 > #define dst x3 > #define dstend x4 > -#define tmp1 x5 > -#define tmp1w w5 > -#define tmp2 x6 > -#define tmp2w w6 > -#define zva_len x7 > -#define zva_lenw w7 > - > -#define L(l) .L ## l > +#define zva_val x5 > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn memset p2align=6 > +ENTRY (memset) > + PTR_ARG (0) > + SIZE_ARG (2) > > dup v0.16B, valw > add dstend, dstin, count > @@ -101,7 +42,7 @@ def_fn memset p2align=6 > str val, [dstin] > str val, [dstend, -8] > ret > - nop > + .p2align 4 > 1: tbz count, 2, 2f > str valw, [dstin] > str valw, [dstend, -4] > @@ -131,110 +72,49 @@ L(set96): > stp q0, q0, [dstend, -32] > ret > > - .p2align 3 > - nop > + .p2align 4 > L(set_long): > and valw, valw, 255 > bic dst, dstin, 15 > str q0, [dstin] > - cmp count, 256 > - ccmp valw, 0, 0, cs > - b.eq L(try_zva) > -L(no_zva): > - sub count, dstend, dst /* Count is 16 too large. */ > - sub dst, dst, 16 /* Dst is biased by -32. */ > - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ > -1: stp q0, q0, [dst, 32] > - stp q0, q0, [dst, 64]! > -L(tail64): > - subs count, count, 64 > - b.hi 1b > -2: stp q0, q0, [dstend, -64] > - stp q0, q0, [dstend, -32] > - ret > - > - .p2align 3 > -L(try_zva): > - mrs tmp1, dczid_el0 > - tbnz tmp1w, 4, L(no_zva) > - and tmp1w, tmp1w, 15 > - cmp tmp1w, 4 /* ZVA size is 64 bytes. */ > - b.ne L(zva_128) > - > - /* Write the first and last 64 byte aligned block using stp rather > - than using DC ZVA. This is faster on some cores. > - */ > -L(zva_64): > + cmp count, 160 > + ccmp valw, 0, 0, hs > + b.ne L(no_zva) > + > +#ifndef SKIP_ZVA_CHECK > + mrs zva_val, dczid_el0 > + and zva_val, zva_val, 31 > + cmp zva_val, 4 /* ZVA size is 64 bytes. 
*/ > + b.ne L(no_zva) > +#endif > str q0, [dst, 16] > stp q0, q0, [dst, 32] > bic dst, dst, 63 > - stp q0, q0, [dst, 64] > - stp q0, q0, [dst, 96] > - sub count, dstend, dst /* Count is now 128 too large. */ > - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ > - add dst, dst, 128 > - nop > -1: dc zva, dst > + sub count, dstend, dst /* Count is now 64 too large. */ > + sub count, count, 128 /* Adjust count and bias for loop. */ > + > + .p2align 4 > +L(zva_loop): > add dst, dst, 64 > + dc zva, dst > subs count, count, 64 > - b.hi 1b > - stp q0, q0, [dst, 0] > - stp q0, q0, [dst, 32] > + b.hi L(zva_loop) > stp q0, q0, [dstend, -64] > stp q0, q0, [dstend, -32] > ret > > - .p2align 3 > -L(zva_128): > - cmp tmp1w, 5 /* ZVA size is 128 bytes. */ > - b.ne L(zva_other) > - > - str q0, [dst, 16] > +L(no_zva): > + sub count, dstend, dst /* Count is 16 too large. */ > + sub dst, dst, 16 /* Dst is biased by -32. */ > + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ > +L(no_zva_loop): > stp q0, q0, [dst, 32] > - stp q0, q0, [dst, 64] > - stp q0, q0, [dst, 96] > - bic dst, dst, 127 > - sub count, dstend, dst /* Count is now 128 too large. */ > - sub count, count, 128+128 /* Adjust count and bias for loop. */ > - add dst, dst, 128 > -1: dc zva, dst > - add dst, dst, 128 > - subs count, count, 128 > - b.hi 1b > - stp q0, q0, [dstend, -128] > - stp q0, q0, [dstend, -96] > + stp q0, q0, [dst, 64]! > + subs count, count, 64 > + b.hi L(no_zva_loop) > stp q0, q0, [dstend, -64] > stp q0, q0, [dstend, -32] > ret > > -L(zva_other): > - mov tmp2w, 4 > - lsl zva_lenw, tmp2w, tmp1w > - add tmp1, zva_len, 64 /* Max alignment bytes written. */ > - cmp count, tmp1 > - blo L(no_zva) > - > - sub tmp2, zva_len, 1 > - add tmp1, dst, zva_len > - add dst, dst, 16 > - subs count, tmp1, dst /* Actual alignment bytes to write. */ > - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ > - beq 2f > -1: stp q0, q0, [dst], 64 > - stp q0, q0, [dst, -32] > - subs count, count, 64 > - b.hi 1b > -2: mov dst, tmp1 > - sub count, dstend, tmp1 /* Remaining bytes to write. */ > - subs count, count, zva_len > - b.lo 4f > -3: dc zva, dst > - add dst, dst, zva_len > - subs count, count, zva_len > - b.hs 3b > -4: add count, count, zva_len > - sub dst, dst, 32 /* Bias dst for tail loop. */ > - b L(tail64) > - > - .size memset, . - memset > +END (memset) > #endif > diff --git a/newlib/libc/machine/aarch64/stpcpy.S b/newlib/libc/machine/aarch64/stpcpy.S > index 696b45889f..155c68d75a 100644 > --- a/newlib/libc/machine/aarch64/stpcpy.S > +++ b/newlib/libc/machine/aarch64/stpcpy.S > @@ -1,34 +1,10 @@ > /* > - stpcpy - copy a string returning pointer to end. > + * stpcpy - copy a string returning pointer to end. > + * > + * Copyright (c) 2020, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > > - Copyright (c) 2015 ARM Ltd. > - All Rights Reserved. > +#define BUILD_STPCPY 1 > > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. 
> - * Neither the name of the company nor the names of its contributors > - may be used to endorse or promote products derived from this > - software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > -/* This is just a wrapper that uses strcpy code with appropriate > - pre-defines. */ > - > -#define BUILD_STPCPY > #include "strcpy.S" > diff --git a/newlib/libc/machine/aarch64/strchr.S b/newlib/libc/machine/aarch64/strchr.S > index 2448dbc7d5..500d9aff29 100644 > --- a/newlib/libc/machine/aarch64/strchr.S > +++ b/newlib/libc/machine/aarch64/strchr.S > @@ -1,32 +1,9 @@ > /* > - strchr - find a character in a string > - > - Copyright (c) 2014, ARM Limited > - All rights Reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the company nor the names of its contributors > - may be used to endorse or promote products derived from this > - software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > + * strchr - find a character in a string > + * > + * Copyright (c) 2014-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strchr-stub.c */ > #else > @@ -37,6 +14,8 @@ > * Neon Available. > */ > > +#include "asmdefs.h" > + > /* Arguments and results. */ > #define srcin x0 > #define chrin w1 > @@ -74,26 +53,19 @@ > > /* Locals and temporaries. 
*/ > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn strchr > - /* Magic constant 0x40100401 to allow us to identify which lane > - matches the requested byte. Magic constant 0x80200802 used > - similarly for NUL termination. */ > - mov wtmp2, #0x0401 > - movk wtmp2, #0x4010, lsl #16 > +ENTRY (strchr) > + PTR_ARG (0) > + /* Magic constant 0xc0300c03 to allow us to identify which lane > + matches the requested byte. Even bits are set if the character > + matches, odd bits if either the char is NUL or matches. */ > + mov wtmp2, 0x0c03 > + movk wtmp2, 0xc030, lsl 16 > dup vrepchr.16b, chrin > bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ > dup vrepmask_c.4s, wtmp2 > ands tmp1, srcin, #31 > add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ > - b.eq .Lloop > + b.eq L(loop) > > /* Input string is not 32-byte aligned. Rather than forcing > the padding bytes to a safe value, we calculate the syndrome > @@ -105,49 +77,42 @@ def_fn strchr > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > cmeq vhas_nul2.16b, vdata2.16b, #0 > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b > - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b > - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b > - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b > - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b > - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b > + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b > + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b > + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b > + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b > lsl tmp1, tmp1, #1 > addp vend1.16b, vend1.16b, vend2.16b // 256->128 > mov tmp3, #~0 > addp vend1.16b, vend1.16b, vend2.16b // 128->64 > lsr tmp1, tmp3, tmp1 > > - mov tmp3, vend1.2d[0] > + mov tmp3, vend1.d[0] > bic tmp1, tmp3, tmp1 // Mask padding bits. > - cbnz tmp1, .Ltail > + cbnz tmp1, L(tail) > > -.Lloop: > + .p2align 4 > +L(loop): > ld1 {vdata1.16b, vdata2.16b}, [src], #32 > - cmeq vhas_nul1.16b, vdata1.16b, #0 > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > - cmeq vhas_nul2.16b, vdata2.16b, #0 > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > - /* Use a fast check for the termination condition. */ > - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b > - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b > - orr vend1.16b, vend1.16b, vend2.16b > - addp vend1.2d, vend1.2d, vend1.2d > - mov tmp1, vend1.2d[0] > - cbz tmp1, .Lloop > + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b > + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b > + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b > + umaxp vend1.16b, vend1.16b, vend1.16b > + mov tmp1, vend1.d[0] > + cbz tmp1, L(loop) > > /* Termination condition found. Now need to establish exactly why > we terminated. 
*/ > - and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b > - and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b > - and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b > - and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b > - orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b > - orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b > + bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b > + bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b > + and vend1.16b, vhas_nul1.16b, vrepmask_c.16b > + and vend2.16b, vhas_nul2.16b, vrepmask_c.16b > addp vend1.16b, vend1.16b, vend2.16b // 256->128 > addp vend1.16b, vend1.16b, vend2.16b // 128->64 > - > - mov tmp1, vend1.2d[0] > -.Ltail: > + mov tmp1, vend1.d[0] > +L(tail): > /* Count the trailing zeros, by bit reversing... */ > rbit tmp1, tmp1 > /* Re-bias source. */ > @@ -160,5 +125,5 @@ def_fn strchr > csel result, result, xzr, eq > ret > > - .size strchr, . - strchr > +END (strchr) > #endif > diff --git a/newlib/libc/machine/aarch64/strchrnul.S b/newlib/libc/machine/aarch64/strchrnul.S > index a0ac13b7f4..ceaf4dca17 100644 > --- a/newlib/libc/machine/aarch64/strchrnul.S > +++ b/newlib/libc/machine/aarch64/strchrnul.S > @@ -1,32 +1,9 @@ > /* > - strchrnul - find a character or nul in a string > - > - Copyright (c) 2014, ARM Limited > - All rights Reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the company nor the names of its contributors > - may be used to endorse or promote products derived from this > - software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > + * strchrnul - find a character or nul in a string > + * > + * Copyright (c) 2014-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strchrnul-stub.c */ > #else > @@ -37,6 +14,8 @@ > * Neon Available. > */ > > +#include "asmdefs.h" > + > /* Arguments and results. */ > #define srcin x0 > #define chrin w1 > @@ -70,15 +49,8 @@ > > /* Locals and temporaries. 
*/ > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn strchrnul > +ENTRY (strchrnul) > + PTR_ARG (0) > /* Magic constant 0x40100401 to allow us to identify which lane > matches the termination condition. */ > mov wtmp2, #0x0401 > @@ -87,7 +59,7 @@ def_fn strchrnul > bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ > dup vrepmask.4s, wtmp2 > ands tmp1, srcin, #31 > - b.eq .Lloop > + b.eq L(loop) > > /* Input string is not 32-byte aligned. Rather than forcing > the padding bytes to a safe value, we calculate the syndrome > @@ -95,47 +67,43 @@ def_fn strchrnul > syndrome that are related to the padding. */ > ld1 {vdata1.16b, vdata2.16b}, [src], #32 > neg tmp1, tmp1 > - cmeq vhas_nul1.16b, vdata1.16b, #0 > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > - cmeq vhas_nul2.16b, vdata2.16b, #0 > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > - orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b > - orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b > - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b > - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b > + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b > + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b > + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b > + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b > lsl tmp1, tmp1, #1 > addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 > mov tmp3, #~0 > addp vend1.16b, vend1.16b, vend1.16b // 128->64 > lsr tmp1, tmp3, tmp1 > > - mov tmp3, vend1.2d[0] > + mov tmp3, vend1.d[0] > bic tmp1, tmp3, tmp1 // Mask padding bits. > - cbnz tmp1, .Ltail > + cbnz tmp1, L(tail) > > -.Lloop: > + .p2align 4 > +L(loop): > ld1 {vdata1.16b, vdata2.16b}, [src], #32 > - cmeq vhas_nul1.16b, vdata1.16b, #0 > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > - cmeq vhas_nul2.16b, vdata2.16b, #0 > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > - /* Use a fast check for the termination condition. */ > - orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b > - orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b > - orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b > - addp vend1.2d, vend1.2d, vend1.2d > - mov tmp1, vend1.2d[0] > - cbz tmp1, .Lloop > + cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b > + cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b > + orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b > + umaxp vend1.16b, vend1.16b, vend1.16b > + mov tmp1, vend1.d[0] > + cbz tmp1, L(loop) > > /* Termination condition found. Now need to establish exactly why > we terminated. */ > - and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b > - and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b > + and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b > + and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b > addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 > addp vend1.16b, vend1.16b, vend1.16b // 128->64 > > - mov tmp1, vend1.2d[0] > -.Ltail: > + mov tmp1, vend1.d[0] > +L(tail): > /* Count the trailing zeros, by bit reversing... */ > rbit tmp1, tmp1 > /* Re-bias source. */ > @@ -145,5 +113,5 @@ def_fn strchrnul > add result, src, tmp1, lsr #1 > ret > > - .size strchrnul, . - strchrnul > +END (strchrnul) > #endif > diff --git a/newlib/libc/machine/aarch64/strcmp.S b/newlib/libc/machine/aarch64/strcmp.S > index e2bef2d49d..691a1760ee 100644 > --- a/newlib/libc/machine/aarch64/strcmp.S > +++ b/newlib/libc/machine/aarch64/strcmp.S > @@ -1,202 +1,192 @@ > -/* Copyright (c) 2012-2018, Linaro Limited > - All rights reserved. 
> - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > -/* Assumptions: > +/* > + * strcmp - compare two strings > * > - * ARMv8-a, AArch64 > + * Copyright (c) 2012-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > */ > > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strcmp-stub.c */ > #else > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > +/* Assumptions: > + * > + * ARMv8-a, AArch64. > + * MTE compatible. > + */ > > -#define L(label) .L ## label > +#include "asmdefs.h" > > #define REP8_01 0x0101010101010101 > #define REP8_7f 0x7f7f7f7f7f7f7f7f > -#define REP8_80 0x8080808080808080 > > -/* Parameters and result. */ > #define src1 x0 > #define src2 x1 > #define result x0 > > -/* Internal variables. */ > #define data1 x2 > #define data1w w2 > #define data2 x3 > #define data2w w3 > #define has_nul x4 > #define diff x5 > +#define off1 x5 > #define syndrome x6 > -#define tmp1 x7 > -#define tmp2 x8 > -#define tmp3 x9 > -#define zeroones x10 > -#define pos x11 > - > - /* Start of performance-critical section -- one 64B cache line. */ > -def_fn strcmp p2align=6 > - eor tmp1, src1, src2 > - mov zeroones, #REP8_01 > - tst tmp1, #7 > +#define tmp x6 > +#define data3 x7 > +#define zeroones x8 > +#define shift x9 > +#define off2 x10 > + > +/* On big-endian early bytes are at MSB and on little-endian LSB. > + LS_FW means shifting towards early bytes. */ > +#ifdef __AARCH64EB__ > +# define LS_FW lsl > +#else > +# define LS_FW lsr > +#endif > + > +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > + can be done in parallel across the entire word. > + Since carry propagation makes 0x1 bytes before a NUL byte appear > + NUL too in big-endian, byte-reverse the data before the NUL check. 
*/ > + > + > +ENTRY (strcmp) > + PTR_ARG (0) > + PTR_ARG (1) > + sub off2, src2, src1 > + mov zeroones, REP8_01 > + and tmp, src1, 7 > + tst off2, 7 > b.ne L(misaligned8) > - ands tmp1, src1, #7 > - b.ne L(mutual_align) > - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > - can be done in parallel across the entire word. */ > + cbnz tmp, L(mutual_align) > + > + .p2align 4 > + > L(loop_aligned): > - ldr data1, [src1], #8 > - ldr data2, [src2], #8 > + ldr data2, [src1, off2] > + ldr data1, [src1], 8 > L(start_realigned): > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - eor diff, data1, data2 /* Non-zero if differences found. */ > - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ > +#ifdef __AARCH64EB__ > + rev tmp, data1 > + sub has_nul, tmp, zeroones > + orr tmp, tmp, REP8_7f > +#else > + sub has_nul, data1, zeroones > + orr tmp, data1, REP8_7f > +#endif > + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ > + ccmp data1, data2, 0, eq > + b.eq L(loop_aligned) > +#ifdef __AARCH64EB__ > + rev has_nul, has_nul > +#endif > + eor diff, data1, data2 > orr syndrome, diff, has_nul > - cbz syndrome, L(loop_aligned) > - /* End of performance-critical section -- one 64B cache line. */ > - > L(end): > -#ifndef __AARCH64EB__ > +#ifndef __AARCH64EB__ > rev syndrome, syndrome > rev data1, data1 > - /* The MS-non-zero bit of the syndrome marks either the first bit > - that is different, or the top bit of the first zero byte. > - Shifting left now will bring the critical information into the > - top bits. */ > - clz pos, syndrome > rev data2, data2 > - lsl data1, data1, pos > - lsl data2, data2, pos > - /* But we need to zero-extend (char is unsigned) the value and then > - perform a signed 32-bit subtraction. */ > - lsr data1, data1, #56 > - sub result, data1, data2, lsr #56 > - ret > -#else > - /* For big-endian we cannot use the trick with the syndrome value > - as carry-propagation can corrupt the upper bits if the trailing > - bytes in the string contain 0x01. */ > - /* However, if there is no NUL byte in the dword, we can generate > - the result directly. We can't just subtract the bytes as the > - MSB might be significant. */ > - cbnz has_nul, 1f > - cmp data1, data2 > - cset result, ne > - cneg result, result, lo > - ret > -1: > - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ > - rev tmp3, data1 > - sub tmp1, tmp3, zeroones > - orr tmp2, tmp3, #REP8_7f > - bic has_nul, tmp1, tmp2 > - rev has_nul, has_nul > - orr syndrome, diff, has_nul > - clz pos, syndrome > - /* The MS-non-zero bit of the syndrome marks either the first bit > - that is different, or the top bit of the first zero byte. > +#endif > + clz shift, syndrome > + /* The most-significant-non-zero bit of the syndrome marks either the > + first bit that is different, or the top bit of the first zero byte. > Shifting left now will bring the critical information into the > top bits. */ > - lsl data1, data1, pos > - lsl data2, data2, pos > + lsl data1, data1, shift > + lsl data2, data2, shift > /* But we need to zero-extend (char is unsigned) the value and then > perform a signed 32-bit subtraction. */ > - lsr data1, data1, #56 > - sub result, data1, data2, lsr #56 > + lsr data1, data1, 56 > + sub result, data1, data2, lsr 56 > ret > -#endif > + > + .p2align 4 > > L(mutual_align): > /* Sources are mutually aligned, but are not currently at an > alignment boundary. 
Round down the addresses and then mask off > - the bytes that preceed the start point. */ > - bic src1, src1, #7 > - bic src2, src2, #7 > - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ > - ldr data1, [src1], #8 > - neg tmp1, tmp1 /* Bits to alignment -64. */ > - ldr data2, [src2], #8 > - mov tmp2, #~0 > -#ifdef __AARCH64EB__ > - /* Big-endian. Early bytes are at MSB. */ > - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ > -#else > - /* Little-endian. Early bytes are at LSB. */ > - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ > -#endif > - orr data1, data1, tmp2 > - orr data2, data2, tmp2 > + the bytes that precede the start point. */ > + bic src1, src1, 7 > + ldr data2, [src1, off2] > + ldr data1, [src1], 8 > + neg shift, src2, lsl 3 /* Bits to alignment -64. */ > + mov tmp, -1 > + LS_FW tmp, tmp, shift > + orr data1, data1, tmp > + orr data2, data2, tmp > b L(start_realigned) > > L(misaligned8): > /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always > - checking to make sure that we don't access beyond page boundary in > - SRC2. */ > - tst src1, #7 > - b.eq L(loop_misaligned) > + checking to make sure that we don't access beyond the end of SRC2. */ > + cbz tmp, L(src1_aligned) > L(do_misaligned): > - ldrb data1w, [src1], #1 > - ldrb data2w, [src2], #1 > - cmp data1w, #1 > - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ > + ldrb data1w, [src1], 1 > + ldrb data2w, [src2], 1 > + cmp data1w, 0 > + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ > b.ne L(done) > - tst src1, #7 > + tst src1, 7 > b.ne L(do_misaligned) > > -L(loop_misaligned): > - /* Test if we are within the last dword of the end of a 4K page. If > - yes then jump back to the misaligned loop to copy a byte at a time. */ > - and tmp1, src2, #0xff8 > - eor tmp1, tmp1, #0xff8 > - cbz tmp1, L(do_misaligned) > - ldr data1, [src1], #8 > - ldr data2, [src2], #8 > - > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - eor diff, data1, data2 /* Non-zero if differences found. */ > - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. 
*/ > +L(src1_aligned): > + neg shift, src2, lsl 3 > + bic src2, src2, 7 > + ldr data3, [src2], 8 > +#ifdef __AARCH64EB__ > + rev data3, data3 > +#endif > + lsr tmp, zeroones, shift > + orr data3, data3, tmp > + sub has_nul, data3, zeroones > + orr tmp, data3, REP8_7f > + bics has_nul, has_nul, tmp > + b.ne L(tail) > + > + sub off1, src2, src1 > + > + .p2align 4 > + > +L(loop_unaligned): > + ldr data3, [src1, off1] > + ldr data2, [src1, off2] > +#ifdef __AARCH64EB__ > + rev data3, data3 > +#endif > + sub has_nul, data3, zeroones > + orr tmp, data3, REP8_7f > + ldr data1, [src1], 8 > + bics has_nul, has_nul, tmp > + ccmp data1, data2, 0, eq > + b.eq L(loop_unaligned) > + > + lsl tmp, has_nul, shift > +#ifdef __AARCH64EB__ > + rev tmp, tmp > +#endif > + eor diff, data1, data2 > + orr syndrome, diff, tmp > + cbnz syndrome, L(end) > +L(tail): > + ldr data1, [src1] > + neg shift, shift > + lsr data2, data3, shift > + lsr has_nul, has_nul, shift > +#ifdef __AARCH64EB__ > + rev data2, data2 > + rev has_nul, has_nul > +#endif > + eor diff, data1, data2 > orr syndrome, diff, has_nul > - cbz syndrome, L(loop_misaligned) > b L(end) > > L(done): > sub result, data1, data2 > ret > - .size strcmp, .-strcmp > > +END (strcmp) > #endif > diff --git a/newlib/libc/machine/aarch64/strcpy.S b/newlib/libc/machine/aarch64/strcpy.S > index e5405f2535..57c46f3908 100644 > --- a/newlib/libc/machine/aarch64/strcpy.S > +++ b/newlib/libc/machine/aarch64/strcpy.S > @@ -1,341 +1,160 @@ > /* > - strcpy/stpcpy - copy a string returning pointer to start/end. > - > - Copyright (c) 2013, 2014, 2015 ARM Ltd. > - All Rights Reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the company nor the names of its contributors > - may be used to endorse or promote products derived from this > - software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > + * strcpy/stpcpy - copy a string returning pointer to start/end. > + * > + * Copyright (c) 2020-2023, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strchr-stub.c */ > #else > > /* Assumptions: > * > - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. 
> + * ARMv8-a, AArch64, Advanced SIMD. > + * MTE compatible. > */ > > -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. > +#include "asmdefs.h" > > - To test the page crossing code path more thoroughly, compile with > - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower > - entry path. This option is not intended for production use. */ > - > -/* Arguments and results. */ > #define dstin x0 > #define srcin x1 > +#define result x0 > > -/* Locals and temporaries. */ > #define src x2 > #define dst x3 > -#define data1 x4 > -#define data1w w4 > -#define data2 x5 > -#define data2w w5 > -#define has_nul1 x6 > -#define has_nul2 x7 > -#define tmp1 x8 > -#define tmp2 x9 > -#define tmp3 x10 > -#define tmp4 x11 > -#define zeroones x12 > -#define data1a x13 > -#define data2a x14 > -#define pos x15 > -#define len x16 > -#define to_align x17 > +#define len x4 > +#define synd x4 > +#define tmp x5 > +#define shift x5 > +#define data1 x6 > +#define dataw1 w6 > +#define data2 x7 > +#define dataw2 w7 > + > +#define dataq q0 > +#define vdata v0 > +#define vhas_nul v1 > +#define vend v2 > +#define dend d2 > +#define dataq2 q1 > > #ifdef BUILD_STPCPY > -#define STRCPY stpcpy > +# define STRCPY stpcpy > +# define IFSTPCPY(X,...) X,__VA_ARGS__ > #else > -#define STRCPY strcpy > +# define STRCPY strcpy > +# define IFSTPCPY(X,...) > #endif > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > - can be done in parallel across the entire word. */ > - > -#define REP8_01 0x0101010101010101 > -#define REP8_7f 0x7f7f7f7f7f7f7f7f > -#define REP8_80 0x8080808080808080 > - > - /* AArch64 systems have a minimum page size of 4k. We can do a quick > - page size check for crossing this boundary on entry and if we > - do not, then we can short-circuit much of the entry code. We > - expect early page-crossing strings to be rare (probability of > - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite > - predictable, even with random strings. > - > - We don't bother checking for larger page sizes, the cost of setting > - up the correct page size is just not worth the extra gain from > - a small reduction in the cases taking the slow path. Note that > - we only care about whether the first fetch, which may be > - misaligned, crosses a page boundary - after that we move to aligned > - fetches for the remainder of the string. */ > - > -#ifdef STRCPY_TEST_PAGE_CROSS > - /* Make everything that isn't Qword aligned look like a page cross. */ > -#define MIN_PAGE_P2 4 > -#else > -#define MIN_PAGE_P2 12 > -#endif > - > -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) > - > -def_fn STRCPY p2align=6 > - /* For moderately short strings, the fastest way to do the copy is to > - calculate the length of the string in the same way as strlen, then > - essentially do a memcpy of the result. This avoids the need for > - multiple byte copies and further means that by the time we > - reach the bulk copy loop we know we can always use DWord > - accesses. We expect strcpy to rarely be called repeatedly > - with the same source string, so branch prediction is likely to > - always be difficult - we mitigate against this by preferring > - conditional select operations over branches whenever this is > - feasible. 
*/ > - and tmp2, srcin, #(MIN_PAGE_SIZE - 1) > - mov zeroones, #REP8_01 > - and to_align, srcin, #15 > - cmp tmp2, #(MIN_PAGE_SIZE - 16) > - neg tmp1, to_align > - /* The first fetch will straddle a (possible) page boundary iff > - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte > - aligned string will never fail the page align check, so will > - always take the fast path. */ > - b.gt .Lpage_cross > - > -.Lpage_cross_ok: > - ldp data1, data2, [srcin] > -#ifdef __AARCH64EB__ > - /* Because we expect the end to be found within 16 characters > - (profiling shows this is the most common case), it's worth > - swapping the bytes now to save having to recalculate the > - termination syndrome later. We preserve data1 and data2 > - so that we can re-use the values later on. */ > - rev tmp2, data1 > - sub tmp1, tmp2, zeroones > - orr tmp2, tmp2, #REP8_7f > - bics has_nul1, tmp1, tmp2 > - b.ne .Lfp_le8 > - rev tmp4, data2 > - sub tmp3, tmp4, zeroones > - orr tmp4, tmp4, #REP8_7f > -#else > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - bics has_nul1, tmp1, tmp2 > - b.ne .Lfp_le8 > - sub tmp3, data2, zeroones > - orr tmp4, data2, #REP8_7f > +/* > + Core algorithm: > + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits > + per byte. We take 4 bits of every comparison byte with shift right and narrow > + by 4 instruction. Since the bits in the nibble mask reflect the order in > + which things occur in the original string, counting leading zeros identifies > + exactly which byte matched. */ > + > +ENTRY (STRCPY) > + PTR_ARG (0) > + PTR_ARG (1) > + bic src, srcin, 15 > + ld1 {vdata.16b}, [src] > + cmeq vhas_nul.16b, vdata.16b, 0 > + lsl shift, srcin, 2 > + shrn vend.8b, vhas_nul.8h, 4 > + fmov synd, dend > + lsr synd, synd, shift > + cbnz synd, L(tail) > + > + ldr dataq, [src, 16]! > + cmeq vhas_nul.16b, vdata.16b, 0 > + shrn vend.8b, vhas_nul.8h, 4 > + fmov synd, dend > + cbz synd, L(start_loop) > + > +#ifndef __AARCH64EB__ > + rbit synd, synd > #endif > - bics has_nul2, tmp3, tmp4 > - b.eq .Lbulk_entry > + sub tmp, src, srcin > + clz len, synd > + add len, tmp, len, lsr 2 > + tbz len, 4, L(less16) > + sub tmp, len, 15 > + ldr dataq, [srcin] > + ldr dataq2, [srcin, tmp] > + str dataq, [dstin] > + str dataq2, [dstin, tmp] > + IFSTPCPY (add result, dstin, len) > + ret > > - /* The string is short (<=16 bytes). We don't know exactly how > - short though, yet. Work out the exact length so that we can > - quickly select the optimal copy strategy. */ > -.Lfp_gt8: > - rev has_nul2, has_nul2 > - clz pos, has_nul2 > - mov tmp2, #56 > - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ > - sub pos, tmp2, pos > -#ifdef __AARCH64EB__ > - lsr data2, data2, pos > -#else > - lsl data2, data2, pos > -#endif > - str data2, [dst, #1] > +L(tail): > + rbit synd, synd > + clz len, synd > + lsr len, len, 2 > +L(less16): > + tbz len, 3, L(less8) > + sub tmp, len, 7 > + ldr data1, [srcin] > + ldr data2, [srcin, tmp] > str data1, [dstin] > -#ifdef BUILD_STPCPY > - add dstin, dst, #8 > -#endif > + str data2, [dstin, tmp] > + IFSTPCPY (add result, dstin, len) > ret > > -.Lfp_le8: > - rev has_nul1, has_nul1 > - clz pos, has_nul1 > - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ > - subs tmp2, pos, #24 /* Pos in bits. */ > - b.lt .Lfp_lt4 > -#ifdef __AARCH64EB__ > - mov tmp2, #56 > - sub pos, tmp2, pos > - lsr data2, data1, pos > - lsr data1, data1, #32 > -#else > - lsr data2, data1, tmp2 > -#endif > - /* 4->7 bytes to copy. 
*/ > - str data2w, [dst, #-3] > - str data1w, [dstin] > -#ifdef BUILD_STPCPY > - mov dstin, dst > -#endif > - ret > -.Lfp_lt4: > - cbz pos, .Lfp_lt2 > - /* 2->3 bytes to copy. */ > -#ifdef __AARCH64EB__ > - lsr data1, data1, #48 > -#endif > - strh data1w, [dstin] > - /* Fall-through, one byte (max) to go. */ > -.Lfp_lt2: > - /* Null-terminated string. Last character must be zero! */ > - strb wzr, [dst] > -#ifdef BUILD_STPCPY > - mov dstin, dst > -#endif > + .p2align 4 > +L(less8): > + subs tmp, len, 3 > + b.lo L(less4) > + ldr dataw1, [srcin] > + ldr dataw2, [srcin, tmp] > + str dataw1, [dstin] > + str dataw2, [dstin, tmp] > + IFSTPCPY (add result, dstin, len) > ret > > - .p2align 6 > - /* Aligning here ensures that the entry code and main loop all lies > - within one 64-byte cache line. */ > -.Lbulk_entry: > - sub to_align, to_align, #16 > - stp data1, data2, [dstin] > - sub src, srcin, to_align > - sub dst, dstin, to_align > - b .Lentry_no_page_cross > - > - /* The inner loop deals with two Dwords at a time. This has a > - slightly higher start-up cost, but we should win quite quickly, > - especially on cores with a high number of issue slots per > - cycle, as we get much better parallelism out of the operations. */ > -.Lmain_loop: > - stp data1, data2, [dst], #16 > -.Lentry_no_page_cross: > - ldp data1, data2, [src], #16 > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, #REP8_7f > - bic has_nul1, tmp1, tmp2 > - bics has_nul2, tmp3, tmp4 > - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ > - b.eq .Lmain_loop > - > - /* Since we know we are copying at least 16 bytes, the fastest way > - to deal with the tail is to determine the location of the > - trailing NUL, then (re)copy the 16 bytes leading up to that. */ > - cmp has_nul1, #0 > -#ifdef __AARCH64EB__ > - /* For big-endian, carry propagation (if the final byte in the > - string is 0x01) means we cannot use has_nul directly. The > - easiest way to get the correct byte is to byte-swap the data > - and calculate the syndrome a second time. */ > - csel data1, data1, data2, ne > - rev data1, data1 > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - bic has_nul1, tmp1, tmp2 > -#else > - csel has_nul1, has_nul1, has_nul2, ne > -#endif > - rev has_nul1, has_nul1 > - clz pos, has_nul1 > - add tmp1, pos, #72 > - add pos, pos, #8 > - csel pos, pos, tmp1, ne > - add src, src, pos, lsr #3 > - add dst, dst, pos, lsr #3 > - ldp data1, data2, [src, #-32] > - stp data1, data2, [dst, #-16] > -#ifdef BUILD_STPCPY > - sub dstin, dst, #1 > -#endif > +L(less4): > + cbz len, L(zerobyte) > + ldrh dataw1, [srcin] > + strh dataw1, [dstin] > +L(zerobyte): > + strb wzr, [dstin, len] > + IFSTPCPY (add result, dstin, len) > ret > > -.Lpage_cross: > - bic src, srcin, #15 > - /* Start by loading two words at [srcin & ~15], then forcing the > - bytes that precede srcin to 0xff. This means they never look > - like termination bytes. */ > - ldp data1, data2, [src] > - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ > - tst to_align, #7 > - csetm tmp2, ne > -#ifdef __AARCH64EB__ > - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ > -#else > - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). 
*/ > + .p2align 4 > +L(start_loop): > + sub tmp, srcin, dstin > + ldr dataq2, [srcin] > + sub dst, src, tmp > + str dataq2, [dstin] > +L(loop): > + str dataq, [dst], 32 > + ldr dataq, [src, 16] > + cmeq vhas_nul.16b, vdata.16b, 0 > + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b > + fmov synd, dend > + cbnz synd, L(loopend) > + str dataq, [dst, -16] > + ldr dataq, [src, 32]! > + cmeq vhas_nul.16b, vdata.16b, 0 > + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b > + fmov synd, dend > + cbz synd, L(loop) > + add dst, dst, 16 > +L(loopend): > + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ > + fmov synd, dend > + sub dst, dst, 31 > +#ifndef __AARCH64EB__ > + rbit synd, synd > #endif > - orr data1, data1, tmp2 > - orr data2a, data2, tmp2 > - cmp to_align, #8 > - csinv data1, data1, xzr, lt > - csel data2, data2, data2a, lt > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, #REP8_7f > - bic has_nul1, tmp1, tmp2 > - bics has_nul2, tmp3, tmp4 > - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ > - b.eq .Lpage_cross_ok > - /* We now need to make data1 and data2 look like they've been > - loaded directly from srcin. Do a rotate on the 128-bit value. */ > - lsl tmp1, to_align, #3 /* Bytes->bits. */ > - neg tmp2, to_align, lsl #3 > -#ifdef __AARCH64EB__ > - lsl data1a, data1, tmp1 > - lsr tmp4, data2, tmp2 > - lsl data2, data2, tmp1 > - orr tmp4, tmp4, data1a > - cmp to_align, #8 > - csel data1, tmp4, data2, lt > - rev tmp2, data1 > - rev tmp4, data2 > - sub tmp1, tmp2, zeroones > - orr tmp2, tmp2, #REP8_7f > - sub tmp3, tmp4, zeroones > - orr tmp4, tmp4, #REP8_7f > -#else > - lsr data1a, data1, tmp1 > - lsl tmp4, data2, tmp2 > - lsr data2, data2, tmp1 > - orr tmp4, tmp4, data1a > - cmp to_align, #8 > - csel data1, tmp4, data2, lt > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, #REP8_7f > -#endif > - bic has_nul1, tmp1, tmp2 > - cbnz has_nul1, .Lfp_le8 > - bic has_nul2, tmp3, tmp4 > - b .Lfp_gt8 > + clz len, synd > + lsr len, len, 2 > + add dst, dst, len > + ldr dataq, [dst, tmp] > + str dataq, [dst] > + IFSTPCPY (add result, dst, 15) > + ret > > - .size STRCPY, . - STRCPY > +END (STRCPY) > #endif > diff --git a/newlib/libc/machine/aarch64/strlen.S b/newlib/libc/machine/aarch64/strlen.S > index 872d136ef4..68a6f357cf 100644 > --- a/newlib/libc/machine/aarch64/strlen.S > +++ b/newlib/libc/machine/aarch64/strlen.S > @@ -1,115 +1,92 @@ > -/* Copyright (c) 2013-2015, Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > +/* > + * strlen - calculate the length of a string. > + * > + * Copyright (c) 2020-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strlen-stub.c */ > #else > > /* Assumptions: > * > - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. > + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. > + * Not MTE compatible. > */ > > -/* To test the page crossing code path more thoroughly, compile with > - -DTEST_PAGE_CROSS - this will force all calls through the slower > - entry path. This option is not intended for production use. */ > - > -/* Arguments and results. */ > -#define srcin x0 > -#define len x0 > - > -/* Locals and temporaries. */ > -#define src x1 > -#define data1 x2 > -#define data2 x3 > -#define has_nul1 x4 > -#define has_nul2 x5 > -#define tmp1 x4 > -#define tmp2 x5 > -#define tmp3 x6 > -#define tmp4 x7 > -#define zeroones x8 > - > -#define L(l) .L ## l > - > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > - can be done in parallel across the entire word. A faster check > - (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives > - false hits for characters 129..255. */ > +#include "asmdefs.h" > + > +#define srcin x0 > +#define len x0 > + > +#define src x1 > +#define data1 x2 > +#define data2 x3 > +#define has_nul1 x4 > +#define has_nul2 x5 > +#define tmp1 x4 > +#define tmp2 x5 > +#define tmp3 x6 > +#define tmp4 x7 > +#define zeroones x8 > + > +#define maskv v0 > +#define maskd d0 > +#define dataq1 q1 > +#define dataq2 q2 > +#define datav1 v1 > +#define datav2 v2 > +#define tmp x2 > +#define tmpw w2 > +#define synd x3 > +#define syndw w3 > +#define shift x4 > + > +/* For the first 32 bytes, NUL detection works on the principle that > + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a > + byte is zero, and can be done in parallel across the entire word. */ > > #define REP8_01 0x0101010101010101 > #define REP8_7f 0x7f7f7f7f7f7f7f7f > -#define REP8_80 0x8080808080808080 > + > +/* To test the page crossing code path more thoroughly, compile with > + -DTEST_PAGE_CROSS - this will force all calls through the slower > + entry path. This option is not intended for production use. */ > > #ifdef TEST_PAGE_CROSS > -# define MIN_PAGE_SIZE 15 > +# define MIN_PAGE_SIZE 32 > #else > # define MIN_PAGE_SIZE 4096 > #endif > > - /* Since strings are short on average, we check the first 16 bytes > - of the string for a NUL character. In order to do an unaligned ldp > - safely we have to do a page cross check first. 
If there is a NUL > - byte we calculate the length from the 2 8-byte words using > - conditional select to reduce branch mispredictions (it is unlikely > - strlen will be repeatedly called on strings with the same length). > - > - If the string is longer than 16 bytes, we align src so don't need > - further page cross checks, and process 32 bytes per iteration > - using the fast NUL check. If we encounter non-ASCII characters, > - fallback to a second loop using the full NUL check. > - > - If the page cross check fails, we read 16 bytes from an aligned > - address, remove any characters before the string, and continue > - in the main loop using aligned loads. Since strings crossing a > - page in the first 16 bytes are rare (probability of > - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. > - > - AArch64 systems have a minimum page size of 4k. We don't bother > - checking for larger page sizes - the cost of setting up the correct > - page size is just not worth the extra gain from a small reduction in > - the cases taking the slow path. Note that we only care about > - whether the first fetch, which may be misaligned, crosses a page > - boundary. */ > - > -def_fn strlen p2align=6 > +/* Core algorithm: > + > + Since strings are short on average, we check the first 32 bytes of the > + string for a NUL character without aligning the string. In order to use > + unaligned loads safely we must do a page cross check first. > + > + If there is a NUL byte we calculate the length from the 2 8-byte words > + using conditional select to reduce branch mispredictions (it is unlikely > + strlen will be repeatedly called on strings with the same length). > + > + If the string is longer than 32 bytes, align src so we don't need further > + page cross checks, and process 32 bytes per iteration using a fast SIMD > + loop. > + > + If the page cross check fails, we read 32 bytes from an aligned address, > + and ignore any characters before the string. If it contains a NUL > + character, return the length, if not, continue in the main loop. */ > + > +ENTRY (strlen) > + PTR_ARG (0) > and tmp1, srcin, MIN_PAGE_SIZE - 1 > - mov zeroones, REP8_01 > - cmp tmp1, MIN_PAGE_SIZE - 16 > - b.gt L(page_cross) > + cmp tmp1, MIN_PAGE_SIZE - 32 > + b.hi L(page_cross) > + > + /* Look for a NUL byte in the first 16 bytes. */ > ldp data1, data2, [srcin] > + mov zeroones, REP8_01 > + > #ifdef __AARCH64EB__ > /* For big-endian, carry propagation (if the final byte in the > string is 0x01) means we cannot use has_nul1/2 directly. > @@ -125,114 +102,96 @@ def_fn strlen p2align=6 > bics has_nul1, tmp1, tmp2 > bic has_nul2, tmp3, tmp4 > ccmp has_nul2, 0, 0, eq > - beq L(main_loop_entry) > + b.eq L(bytes16_31) > > - /* Enter with C = has_nul1 == 0. */ > + /* Find the exact offset of the first NUL byte in the first 16 bytes > + from the string start. Enter with C = has_nul1 == 0. */ > csel has_nul1, has_nul1, has_nul2, cc > mov len, 8 > rev has_nul1, has_nul1 > - clz tmp1, has_nul1 > csel len, xzr, len, cc > + clz tmp1, has_nul1 > add len, len, tmp1, lsr 3 > ret > > - /* The inner loop processes 32 bytes per iteration and uses the fast > - NUL check. If we encounter non-ASCII characters, use a second > - loop with the accurate NUL check. */ > - .p2align 4 > -L(main_loop_entry): > - bic src, srcin, 15 > - sub src, src, 16 > -L(main_loop): > - ldp data1, data2, [src, 32]! 
> -.Lpage_cross_entry: > - sub tmp1, data1, zeroones > - sub tmp3, data2, zeroones > - orr tmp2, tmp1, tmp3 > - tst tmp2, zeroones, lsl 7 > - bne 1f > - ldp data1, data2, [src, 16] > + /* Look for a NUL byte at offset 16..31 in the string. */ > +L(bytes16_31): > + ldp data1, data2, [srcin, 16] > +#ifdef __AARCH64EB__ > + rev data1, data1 > + rev data2, data2 > +#endif > sub tmp1, data1, zeroones > - sub tmp3, data2, zeroones > - orr tmp2, tmp1, tmp3 > - tst tmp2, zeroones, lsl 7 > - beq L(main_loop) > - add src, src, 16 > -1: > - /* The fast check failed, so do the slower, accurate NUL check. */ > orr tmp2, data1, REP8_7f > + sub tmp3, data2, zeroones > orr tmp4, data2, REP8_7f > bics has_nul1, tmp1, tmp2 > bic has_nul2, tmp3, tmp4 > ccmp has_nul2, 0, 0, eq > - beq L(nonascii_loop) > + b.eq L(loop_entry) > > - /* Enter with C = has_nul1 == 0. */ > -L(tail): > -#ifdef __AARCH64EB__ > - /* For big-endian, carry propagation (if the final byte in the > - string is 0x01) means we cannot use has_nul1/2 directly. The > - easiest way to get the correct byte is to byte-swap the data > - and calculate the syndrome a second time. */ > - csel data1, data1, data2, cc > - rev data1, data1 > - sub tmp1, data1, zeroones > - orr tmp2, data1, REP8_7f > - bic has_nul1, tmp1, tmp2 > -#else > + /* Find the exact offset of the first NUL byte at offset 16..31 from > + the string start. Enter with C = has_nul1 == 0. */ > csel has_nul1, has_nul1, has_nul2, cc > -#endif > - sub len, src, srcin > + mov len, 24 > rev has_nul1, has_nul1 > - add tmp2, len, 8 > + mov tmp3, 16 > clz tmp1, has_nul1 > - csel len, len, tmp2, cc > + csel len, tmp3, len, cc > add len, len, tmp1, lsr 3 > ret > > -L(nonascii_loop): > - ldp data1, data2, [src, 16]! > - sub tmp1, data1, zeroones > - orr tmp2, data1, REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, REP8_7f > - bics has_nul1, tmp1, tmp2 > - bic has_nul2, tmp3, tmp4 > - ccmp has_nul2, 0, 0, eq > - bne L(tail) > - ldp data1, data2, [src, 16]! > - sub tmp1, data1, zeroones > - orr tmp2, data1, REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, REP8_7f > - bics has_nul1, tmp1, tmp2 > - bic has_nul2, tmp3, tmp4 > - ccmp has_nul2, 0, 0, eq > - beq L(nonascii_loop) > - b L(tail) > + nop > +L(loop_entry): > + bic src, srcin, 31 > + > + .p2align 5 > +L(loop): > + ldp dataq1, dataq2, [src, 32]! > + uminp maskv.16b, datav1.16b, datav2.16b > + uminp maskv.16b, maskv.16b, maskv.16b > + cmeq maskv.8b, maskv.8b, 0 > + fmov synd, maskd > + cbz synd, L(loop) > + > + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ > + cmeq maskv.16b, datav1.16b, 0 > + sub len, src, srcin > + cbnz syndw, 1f > + cmeq maskv.16b, datav2.16b, 0 > + add len, len, 16 > +1: > + /* Generate a bitmask and compute correct byte offset. */ > + shrn maskv.8b, maskv.8h, 4 > + fmov synd, maskd > +#ifndef __AARCH64EB__ > + rbit synd, synd > +#endif > + clz tmp, synd > + add len, len, tmp, lsr 2 > + ret > > - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede > - srcin to 0x7f, so we ignore any NUL bytes before the string. > - Then continue in the aligned loop. */ > L(page_cross): > - bic src, srcin, 15 > - ldp data1, data2, [src] > - lsl tmp1, srcin, 3 > - mov tmp4, -1 > -#ifdef __AARCH64EB__ > - /* Big-endian. Early bytes are at MSB. */ > - lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ > -#else > - /* Little-endian. Early bytes are at LSB. */ > - lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). 
*/ > -#endif > - orr tmp1, tmp1, REP8_80 > - orn data1, data1, tmp1 > - orn tmp2, data2, tmp1 > - tst srcin, 8 > - csel data1, data1, tmp4, eq > - csel data2, data2, tmp2, eq > - b L(page_cross_entry) > - > - .size strlen, . - strlen > + bic src, srcin, 31 > + mov tmpw, 0x0c03 > + movk tmpw, 0xc030, lsl 16 > + ld1 {datav1.16b, datav2.16b}, [src] > + dup maskv.4s, tmpw > + cmeq datav1.16b, datav1.16b, 0 > + cmeq datav2.16b, datav2.16b, 0 > + and datav1.16b, datav1.16b, maskv.16b > + and datav2.16b, datav2.16b, maskv.16b > + addp maskv.16b, datav1.16b, datav2.16b > + addp maskv.16b, maskv.16b, maskv.16b > + fmov synd, maskd > + lsl shift, srcin, 1 > + lsr synd, synd, shift > + cbz synd, L(loop) > + > + rbit synd, synd > + clz len, synd > + lsr len, len, 1 > + ret > + > +END (strlen) > #endif > diff --git a/newlib/libc/machine/aarch64/strncmp.S b/newlib/libc/machine/aarch64/strncmp.S > index ffdabc2607..373695503d 100644 > --- a/newlib/libc/machine/aarch64/strncmp.S > +++ b/newlib/libc/machine/aarch64/strncmp.S > @@ -1,49 +1,23 @@ > -/* Copyright (c) 2013, 2018, Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > +/* > + * strncmp - compare two strings > + * > + * Copyright (c) 2013-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strcmp-stub.c */ > #else > > /* Assumptions: > * > - * ARMv8-a, AArch64 > + * ARMv8-a, AArch64. > + * MTE compatible. > */ > > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > +#include "asmdefs.h" > > #define REP8_01 0x0101010101010101 > #define REP8_7f 0x7f7f7f7f7f7f7f7f > -#define REP8_80 0x8080808080808080 > > /* Parameters and result. 
*/ > #define src1 x0 > @@ -64,86 +38,91 @@ > #define tmp3 x10 > #define zeroones x11 > #define pos x12 > -#define limit_wd x13 > -#define mask x14 > -#define endloop x15 > +#define mask x13 > +#define endloop x14 > #define count mask > +#define offset pos > +#define neg_offset x15 > + > +/* Define endian dependent shift operations. > + On big-endian early bytes are at MSB and on little-endian LSB. > + LS_FW means shifting towards early bytes. > + LS_BK means shifting towards later bytes. > + */ > +#ifdef __AARCH64EB__ > +#define LS_FW lsl > +#define LS_BK lsr > +#else > +#define LS_FW lsr > +#define LS_BK lsl > +#endif > > - .text > - .p2align 6 > - .rep 7 > - nop /* Pad so that the loop below fits a cache line. */ > - .endr > -def_fn strncmp > - cbz limit, .Lret0 > +ENTRY (strncmp) > + PTR_ARG (0) > + PTR_ARG (1) > + SIZE_ARG (2) > + cbz limit, L(ret0) > eor tmp1, src1, src2 > mov zeroones, #REP8_01 > tst tmp1, #7 > and count, src1, #7 > - b.ne .Lmisaligned8 > - cbnz count, .Lmutual_align > - /* Calculate the number of full and partial words -1. */ > - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ > - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ > + b.ne L(misaligned8) > + cbnz count, L(mutual_align) > > /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > can be done in parallel across the entire word. */ > - /* Start of performance-critical section -- one 64B cache line. */ > -.Lloop_aligned: > + .p2align 4 > +L(loop_aligned): > ldr data1, [src1], #8 > ldr data2, [src2], #8 > -.Lstart_realigned: > - subs limit_wd, limit_wd, #1 > +L(start_realigned): > + subs limit, limit, #8 > sub tmp1, data1, zeroones > orr tmp2, data1, #REP8_7f > eor diff, data1, data2 /* Non-zero if differences found. */ > - csinv endloop, diff, xzr, pl /* Last Dword or differences. */ > + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ > bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ > ccmp endloop, #0, #0, eq > - b.eq .Lloop_aligned > - /* End of performance-critical section -- one 64B cache line. */ > + b.eq L(loop_aligned) > + /* End of main loop */ > > - /* Not reached the limit, must have found the end or a diff. */ > - tbz limit_wd, #63, .Lnot_limit > - > - /* Limit % 8 == 0 => all bytes significant. */ > - ands limit, limit, #7 > - b.eq .Lnot_limit > - > - lsl limit, limit, #3 /* Bits -> bytes. */ > - mov mask, #~0 > -#ifdef __AARCH64EB__ > - lsr mask, mask, limit > -#else > - lsl mask, mask, limit > -#endif > - bic data1, data1, mask > - bic data2, data2, mask > - > - /* Make sure that the NUL byte is marked in the syndrome. */ > - orr has_nul, has_nul, mask > - > -.Lnot_limit: > +L(full_check): > +#ifndef __AARCH64EB__ > orr syndrome, diff, has_nul > - > -#ifndef __AARCH64EB__ > + add limit, limit, 8 /* Rewind limit to before last subs. */ > +L(syndrome_check): > + /* Limit was reached. Check if the NUL byte or the difference > + is before the limit. */ > rev syndrome, syndrome > rev data1, data1 > - /* The MS-non-zero bit of the syndrome marks either the first bit > - that is different, or the top bit of the first zero byte. > - Shifting left now will bring the critical information into the > - top bits. */ > clz pos, syndrome > rev data2, data2 > lsl data1, data1, pos > + cmp limit, pos, lsr #3 > lsl data2, data2, pos > /* But we need to zero-extend (char is unsigned) the value and then > perform a signed 32-bit subtraction. 
*/ > lsr data1, data1, #56 > sub result, data1, data2, lsr #56 > + csel result, result, xzr, hi > ret > #else > + /* Not reached the limit, must have found the end or a diff. */ > + tbz limit, #63, L(not_limit) > + add tmp1, limit, 8 > + cbz limit, L(not_limit) > + > + lsl limit, tmp1, #3 /* Bits -> bytes. */ > + mov mask, #~0 > + lsr mask, mask, limit > + bic data1, data1, mask > + bic data2, data2, mask > + > + /* Make sure that the NUL byte is marked in the syndrome. */ > + orr has_nul, has_nul, mask > + > +L(not_limit): > /* For big-endian we cannot use the trick with the syndrome value > as carry-propagation can corrupt the upper bits if the trailing > bytes in the string contain 0x01. */ > @@ -164,10 +143,11 @@ def_fn strncmp > rev has_nul, has_nul > orr syndrome, diff, has_nul > clz pos, syndrome > - /* The MS-non-zero bit of the syndrome marks either the first bit > - that is different, or the top bit of the first zero byte. > + /* The most-significant-non-zero bit of the syndrome marks either the > + first bit that is different, or the top bit of the first zero byte. > Shifting left now will bring the critical information into the > top bits. */ > +L(end_quick): > lsl data1, data1, pos > lsl data2, data2, pos > /* But we need to zero-extend (char is unsigned) the value and then > @@ -177,7 +157,7 @@ def_fn strncmp > ret > #endif > > -.Lmutual_align: > +L(mutual_align): > /* Sources are mutually aligned, but are not currently at an > alignment boundary. Round down the addresses and then mask off > the bytes that precede the start point. > @@ -189,102 +169,143 @@ def_fn strncmp > neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ > ldr data2, [src2], #8 > mov tmp2, #~0 > - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ > -#ifdef __AARCH64EB__ > - /* Big-endian. Early bytes are at MSB. */ > - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ > -#else > - /* Little-endian. Early bytes are at LSB. */ > - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ > -#endif > - and tmp3, limit_wd, #7 > - lsr limit_wd, limit_wd, #3 > - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ > - add limit, limit, count > - add tmp3, tmp3, count > + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ > + /* Adjust the limit and ensure it doesn't overflow. */ > + adds limit, limit, count > + csinv limit, limit, xzr, lo > orr data1, data1, tmp2 > orr data2, data2, tmp2 > - add limit_wd, limit_wd, tmp3, lsr #3 > - b .Lstart_realigned > + b L(start_realigned) > > - .p2align 6 > + .p2align 4 > /* Don't bother with dwords for up to 16 bytes. */ > -.Lmisaligned8: > +L(misaligned8): > cmp limit, #16 > - b.hs .Ltry_misaligned_words > + b.hs L(try_misaligned_words) > > -.Lbyte_loop: > +L(byte_loop): > /* Perhaps we can do better than this. */ > ldrb data1w, [src1], #1 > ldrb data2w, [src2], #1 > subs limit, limit, #1 > ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ > ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ > - b.eq .Lbyte_loop > -.Ldone: > + b.eq L(byte_loop) > +L(done): > sub result, data1, data2 > ret > /* Align the SRC1 to a dword by doing a bytewise compare and then do > the dword loop. 
*/ > -.Ltry_misaligned_words: > - lsr limit_wd, limit, #3 > - cbz count, .Ldo_misaligned > +L(try_misaligned_words): > + cbz count, L(src1_aligned) > > neg count, count > and count, count, #7 > sub limit, limit, count > - lsr limit_wd, limit, #3 > > -.Lpage_end_loop: > +L(page_end_loop): > ldrb data1w, [src1], #1 > ldrb data2w, [src2], #1 > cmp data1w, #1 > ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ > - b.ne .Ldone > + b.ne L(done) > subs count, count, #1 > - b.hi .Lpage_end_loop > + b.hi L(page_end_loop) > + > + /* The following diagram explains the comparison of misaligned strings. > + The bytes are shown in natural order. For little-endian, it is > + reversed in the registers. The "x" bytes are before the string. > + The "|" separates data that is loaded at one time. > + src1 | a a a a a a a a | b b b c c c c c | . . . > + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . > > -.Ldo_misaligned: > - /* Prepare ourselves for the next page crossing. Unlike the aligned > - loop, we fetch 1 less dword because we risk crossing bounds on > - SRC2. */ > - mov count, #8 > - subs limit_wd, limit_wd, #1 > - b.lo .Ldone_loop > -.Lloop_misaligned: > - and tmp2, src2, #0xff8 > - eor tmp2, tmp2, #0xff8 > - cbz tmp2, .Lpage_end_loop > + After shifting in each step, the data looks like this: > + STEP_A STEP_B STEP_C > + data1 a a a a a a a a b b b c c c c c b b b c c c c c > + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c > > + The bytes with "0" are eliminated from the syndrome via mask. > + > + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a > + time from SRC2. The comparison happens in 3 steps. After each step > + the loop can exit, or read from SRC1 or SRC2. */ > +L(src1_aligned): > + /* Calculate offset from 8 byte alignment to string start in bits. No > + need to mask offset since shifts are ignoring upper bits. */ > + lsl offset, src2, #3 > + bic src2, src2, #0xf > + mov mask, -1 > + neg neg_offset, offset > ldr data1, [src1], #8 > - ldr data2, [src2], #8 > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - eor diff, data1, data2 /* Non-zero if differences found. */ > - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ > - ccmp diff, #0, #0, eq > - b.ne .Lnot_limit > - subs limit_wd, limit_wd, #1 > - b.pl .Lloop_misaligned > + ldp tmp1, tmp2, [src2], #16 > + LS_BK mask, mask, neg_offset > + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ > + /* Skip the first compare if data in tmp1 is irrelevant. */ > + tbnz offset, 6, L(misaligned_mid_loop) > > -.Ldone_loop: > - /* We found a difference or a NULL before the limit was reached. */ > - and limit, limit, #7 > - cbz limit, .Lnot_limit > - /* Read the last word. */ > - sub src1, src1, 8 > - sub src2, src2, 8 > - ldr data1, [src1, limit] > - ldr data2, [src2, limit] > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > +L(loop_misaligned): > + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ > + LS_FW data2, tmp1, offset > + LS_BK tmp1, tmp2, neg_offset > + subs limit, limit, #8 > + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ > + sub has_nul, data1, zeroones > eor diff, data1, data2 /* Non-zero if differences found. */ > - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ > - ccmp diff, #0, #0, eq > - b.ne .Lnot_limit > + orr tmp3, data1, #REP8_7f > + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ > + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. 
*/ > + orr tmp3, endloop, has_nul > + cbnz tmp3, L(full_check) > + > + ldr data1, [src1], #8 > +L(misaligned_mid_loop): > + /* STEP_B: Compare first part of data1 to second part of tmp2. */ > + LS_FW data2, tmp2, offset > +#ifdef __AARCH64EB__ > + /* For big-endian we do a byte reverse to avoid carry-propagation > + problem described above. This way we can reuse the has_nul in the > + next step and also use syndrome value trick at the end. */ > + rev tmp3, data1 > + #define data1_fixed tmp3 > +#else > + #define data1_fixed data1 > +#endif > + sub has_nul, data1_fixed, zeroones > + orr tmp3, data1_fixed, #REP8_7f > + eor diff, data2, data1 /* Non-zero if differences found. */ > + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ > +#ifdef __AARCH64EB__ > + rev has_nul, has_nul > +#endif > + cmp limit, neg_offset, lsr #3 > + orr syndrome, diff, has_nul > + bic syndrome, syndrome, mask /* Ignore later bytes. */ > + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ > + cbnz tmp3, L(syndrome_check) > + > + /* STEP_C: Compare second part of data1 to first part of tmp1. */ > + ldp tmp1, tmp2, [src2], #16 > + cmp limit, #8 > + LS_BK data2, tmp1, neg_offset > + eor diff, data2, data1 /* Non-zero if differences found. */ > + orr syndrome, diff, has_nul > + and syndrome, syndrome, mask /* Ignore earlier bytes. */ > + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ > + cbnz tmp3, L(syndrome_check) > + > + ldr data1, [src1], #8 > + sub limit, limit, #8 > + b L(loop_misaligned) > + > +#ifdef __AARCH64EB__ > +L(syndrome_check): > + clz pos, syndrome > + cmp pos, limit, lsl #3 > + b.lo L(end_quick) > +#endif > > -.Lret0: > +L(ret0): > mov result, #0 > ret > - .size strncmp, . - strncmp > +END(strncmp) > #endif > diff --git a/newlib/libc/machine/aarch64/strnlen.S b/newlib/libc/machine/aarch64/strnlen.S > index c255c3f7c6..091002e0b0 100644 > --- a/newlib/libc/machine/aarch64/strnlen.S > +++ b/newlib/libc/machine/aarch64/strnlen.S > @@ -1,187 +1,105 @@ > -/* strnlen - calculate the length of a string with limit. > - > - Copyright (c) 2013, Linaro Limited > - All rights reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the Linaro nor the > - names of its contributors may be used to endorse or promote products > - derived from this software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > +/* > + * strnlen - calculate the length of a string with limit. > + * > + * Copyright (c) 2020-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strlen-stub.c */ > #else > > /* Assumptions: > * > - * ARMv8-a, AArch64 > + * ARMv8-a, AArch64, Advanced SIMD. > + * MTE compatible. > */ > > -/* Arguments and results. */ > +#include "asmdefs.h" > + > #define srcin x0 > -#define len x0 > -#define limit x1 > +#define cntin x1 > +#define result x0 > > -/* Locals and temporaries. */ > #define src x2 > -#define data1 x3 > -#define data2 x4 > -#define data2a x5 > -#define has_nul1 x6 > -#define has_nul2 x7 > -#define tmp1 x8 > -#define tmp2 x9 > -#define tmp3 x10 > -#define tmp4 x11 > -#define zeroones x12 > -#define pos x13 > -#define limit_wd x14 > - > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -#define REP8_01 0x0101010101010101 > -#define REP8_7f 0x7f7f7f7f7f7f7f7f > -#define REP8_80 0x8080808080808080 > - > - .text > - .p2align 6 > -.Lstart: > - /* Pre-pad to ensure critical loop begins an icache line. */ > - .rep 7 > - nop > - .endr > - /* Put this code here to avoid wasting more space with pre-padding. */ > -.Lhit_limit: > - mov len, limit > +#define synd x3 > +#define shift x4 > +#define tmp x4 > +#define cntrem x5 > + > +#define qdata q0 > +#define vdata v0 > +#define vhas_chr v1 > +#define vend v2 > +#define dend d2 > + > +/* > + Core algorithm: > + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with > + four bits per byte using the shrn instruction. A count trailing zeros then > + identifies the first zero byte. */ > + > +ENTRY (strnlen) > + PTR_ARG (0) > + SIZE_ARG (1) > + bic src, srcin, 15 > + cbz cntin, L(nomatch) > + ld1 {vdata.16b}, [src] > + cmeq vhas_chr.16b, vdata.16b, 0 > + lsl shift, srcin, 2 > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ > + fmov synd, dend > + lsr synd, synd, shift > + cbz synd, L(start_loop) > +L(finish): > + rbit synd, synd > + clz synd, synd > + lsr result, synd, 2 > + cmp cntin, result > + csel result, cntin, result, ls > ret > > -def_fn strnlen > - cbz limit, .Lhit_limit > - mov zeroones, #REP8_01 > - bic src, srcin, #15 > - ands tmp1, srcin, #15 > - b.ne .Lmisaligned > - /* Calculate the number of full and partial words -1. */ > - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ > - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ > - > - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 > - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and > - can be done in parallel across the entire word. */ > - /* The inner loop deals with two Dwords at a time. 
This has a > - slightly higher start-up cost, but we should win quite quickly, > - especially on cores with a high number of issue slots per > - cycle, as we get much better parallelism out of the operations. */ > - > - /* Start of critial section -- keep to one 64Byte cache line. */ > -.Lloop: > - ldp data1, data2, [src], #16 > -.Lrealigned: > - sub tmp1, data1, zeroones > - orr tmp2, data1, #REP8_7f > - sub tmp3, data2, zeroones > - orr tmp4, data2, #REP8_7f > - bic has_nul1, tmp1, tmp2 > - bic has_nul2, tmp3, tmp4 > - subs limit_wd, limit_wd, #1 > - orr tmp1, has_nul1, has_nul2 > - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ > - b.eq .Lloop > - /* End of critical section -- keep to one 64Byte cache line. */ > - > - orr tmp1, has_nul1, has_nul2 > - cbz tmp1, .Lhit_limit /* No null in final Qword. */ > - > - /* We know there's a null in the final Qword. The easiest thing > - to do now is work out the length of the string and return > - MIN (len, limit). */ > - > - sub len, src, srcin > - cbz has_nul1, .Lnul_in_data2 > -#ifdef __AARCH64EB__ > - mov data2, data1 > -#endif > - sub len, len, #8 > - mov has_nul2, has_nul1 > -.Lnul_in_data2: > -#ifdef __AARCH64EB__ > - /* For big-endian, carry propagation (if the final byte in the > - string is 0x01) means we cannot use has_nul directly. The > - easiest way to get the correct byte is to byte-swap the data > - and calculate the syndrome a second time. */ > - rev data2, data2 > - sub tmp1, data2, zeroones > - orr tmp2, data2, #REP8_7f > - bic has_nul2, tmp1, tmp2 > -#endif > - sub len, len, #8 > - rev has_nul2, has_nul2 > - clz pos, has_nul2 > - add len, len, pos, lsr #3 /* Bits to bytes. */ > - cmp len, limit > - csel len, len, limit, ls /* Return the lower value. */ > +L(nomatch): > + mov result, cntin > ret > > -.Lmisaligned: > - /* Deal with a partial first word. > - We're doing two things in parallel here; > - 1) Calculate the number of words (but avoiding overflow if > - limit is near ULONG_MAX) - to do this we need to work out > - limit + tmp1 - 1 as a 65-bit value before shifting it; > - 2) Load and mask the initial data words - we force the bytes > - before the ones we are interested in to 0xff - this ensures > - early bytes will not hit any zero detection. */ > - sub limit_wd, limit, #1 > - neg tmp4, tmp1 > - cmp tmp1, #8 > - > - and tmp3, limit_wd, #15 > - lsr limit_wd, limit_wd, #4 > - mov tmp2, #~0 > - > - ldp data1, data2, [src], #16 > - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ > - add tmp3, tmp3, tmp1 > - > -#ifdef __AARCH64EB__ > - /* Big-endian. Early bytes are at MSB. */ > - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ > -#else > - /* Little-endian. Early bytes are at LSB. */ > - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ > +L(start_loop): > + sub tmp, src, srcin > + add tmp, tmp, 17 > + subs cntrem, cntin, tmp > + b.lo L(nomatch) > + > + /* Make sure that it won't overread by a 16-byte chunk */ > + tbz cntrem, 4, L(loop32_2) > + sub src, src, 16 > + .p2align 5 > +L(loop32): > + ldr qdata, [src, 32]! 
> + cmeq vhas_chr.16b, vdata.16b, 0 > + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ > + fmov synd, dend > + cbnz synd, L(end) > +L(loop32_2): > + ldr qdata, [src, 16] > + subs cntrem, cntrem, 32 > + cmeq vhas_chr.16b, vdata.16b, 0 > + b.lo L(end_2) > + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ > + fmov synd, dend > + cbz synd, L(loop32) > +L(end_2): > + add src, src, 16 > +L(end): > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ > + sub result, src, srcin > + fmov synd, dend > +#ifndef __AARCH64EB__ > + rbit synd, synd > #endif > - add limit_wd, limit_wd, tmp3, lsr #4 > - > - orr data1, data1, tmp2 > - orr data2a, data2, tmp2 > - > - csinv data1, data1, xzr, le > - csel data2, data2, data2a, le > - b .Lrealigned > - .size strnlen, . - .Lstart /* Include pre-padding in size. */ > + clz synd, synd > + add result, result, synd, lsr 2 > + cmp cntin, result > + csel result, cntin, result, ls > + ret > > +END (strnlen) > #endif > diff --git a/newlib/libc/machine/aarch64/strrchr.S b/newlib/libc/machine/aarch64/strrchr.S > index d64fc09b1a..b0574228b6 100644 > --- a/newlib/libc/machine/aarch64/strrchr.S > +++ b/newlib/libc/machine/aarch64/strrchr.S > @@ -1,32 +1,9 @@ > /* > - strrchr - find last instance of a character in a string > - > - Copyright (c) 2014, ARM Limited > - All rights Reserved. > - > - Redistribution and use in source and binary forms, with or without > - modification, are permitted provided that the following conditions are met: > - * Redistributions of source code must retain the above copyright > - notice, this list of conditions and the following disclaimer. > - * Redistributions in binary form must reproduce the above copyright > - notice, this list of conditions and the following disclaimer in the > - documentation and/or other materials provided with the distribution. > - * Neither the name of the company nor the names of its contributors > - may be used to endorse or promote products derived from this > - software without specific prior written permission. > - > - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ > - > + * strrchr - find last position of a character in a string. > + * > + * Copyright (c) 2014-2022, Arm Limited. > + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception > + */ > #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) > /* See strchr-stub.c */ > #else > @@ -37,6 +14,8 @@ > * Neon Available. > */ > > +#include "asmdefs.h" > + > /* Arguments and results. */ > #define srcin x0 > #define chrin w1 > @@ -78,17 +57,8 @@ > in the original string a count_trailing_zeros() operation will > identify exactly which byte is causing the termination, and why. */ > > -/* Locals and temporaries. 
*/ > - > - .macro def_fn f p2align=0 > - .text > - .p2align \p2align > - .global \f > - .type \f, %function > -\f: > - .endm > - > -def_fn strrchr > +ENTRY (strrchr) > + PTR_ARG (0) > /* Magic constant 0x40100401 to allow us to identify which lane > matches the requested byte. Magic constant 0x80200802 used > similarly for NUL termination. */ > @@ -100,7 +70,7 @@ def_fn strrchr > mov src_offset, #0 > ands tmp1, srcin, #31 > add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ > - b.eq .Laligned > + b.eq L(aligned) > > /* Input string is not 32-byte aligned. Rather than forcing > the padding bytes to a safe value, we calculate the syndrome > @@ -118,45 +88,45 @@ def_fn strrchr > and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b > addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 > addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 > - addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64 > - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 > - mov nul_match, vhas_nul1.2d[0] > + addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64 > + mov nul_match, vend1.d[0] > lsl tmp1, tmp1, #1 > mov const_m1, #~0 > - mov chr_match, vhas_chr1.2d[0] > lsr tmp3, const_m1, tmp1 > + mov chr_match, vend1.d[1] > > bic nul_match, nul_match, tmp3 // Mask padding bits. > bic chr_match, chr_match, tmp3 // Mask padding bits. > - cbnz nul_match, .Ltail > + cbnz nul_match, L(tail) > > -.Lloop: > + .p2align 4 > +L(loop): > cmp chr_match, #0 > csel src_match, src, src_match, ne > csel src_offset, chr_match, src_offset, ne > -.Laligned: > +L(aligned): > ld1 {vdata1.16b, vdata2.16b}, [src], #32 > - cmeq vhas_nul1.16b, vdata1.16b, #0 > cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b > - cmeq vhas_nul2.16b, vdata2.16b, #0 > cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b > - addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 > + uminp vend1.16b, vdata1.16b, vdata2.16b > and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b > and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b > + cmeq vend1.16b, vend1.16b, 0 > addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 > - addp vend1.16b, vend1.16b, vend1.16b // 128->64 > - addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 > - mov nul_match, vend1.2d[0] > - mov chr_match, vhas_chr1.2d[0] > - cbz nul_match, .Lloop > + addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64 > + mov nul_match, vend1.d[0] > + mov chr_match, vend1.d[1] > + cbz nul_match, L(loop) > > + cmeq vhas_nul1.16b, vdata1.16b, #0 > + cmeq vhas_nul2.16b, vdata2.16b, #0 > and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b > and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b > addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b > addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b > - mov nul_match, vhas_nul1.2d[0] > + mov nul_match, vhas_nul1.d[0] > > -.Ltail: > +L(tail): > /* Work out exactly where the string ends. */ > sub tmp4, nul_match, #1 > eor tmp4, tmp4, nul_match > @@ -178,5 +148,5 @@ def_fn strrchr > > ret > > - .size strrchr, . - strrchr > +END (strrchr) > #endif
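
As a side note for anyone reviewing the algorithmic changes rather than the
licensing question: the imported sources lean on two idioms that the in-file
comments only sketch — the word-at-a-time NUL test ((X - 1) & ~(X | 0x7f))
used by the dword loops in strcmp/strncmp/strlen, and the 4-bits-per-byte
"nibble mask" syndrome that the Advanced SIMD routines build with shrn #4.
A rough C model of both is below; it is purely illustrative, not part of the
patch, and the helper names (has_nul_byte, first_nul_index) are invented for
this sketch.

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of x is zero: per byte, (X - 1) & ~(X | 0x7f).
   This models what the sub/orr/bic(s) sequences in the dword loops compute;
   as the comments in the patch note, carry propagation only matters when
   locating (not detecting) the NUL on big-endian.  */
static uint64_t
has_nul_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

/* Model of the SIMD syndrome: each of the 16 comparison bytes contributes
   four syndrome bits (the shrn #4 narrowing), so the index of the first
   NUL byte is count-trailing-zeros of the syndrome divided by 4 (rbit +
   clz + lsr #2 in the assembly).  Returns -1 when the 16-byte chunk
   contains no NUL.  */
static int
first_nul_index (const unsigned char *chunk16)
{
  uint64_t synd = 0;
  for (int i = 0; i < 16; i++)
    if (chunk16[i] == 0)
      synd |= 0xfULL << (4 * i);
  if (synd == 0)
    return -1;
  return __builtin_ctzll (synd) / 4;
}

The same 4-bits-per-byte scaling is why, for example, strnlen and strcpy
compute "lsl shift, srcin, 2" before shifting the syndrome: every byte of
misalignment discards four syndrome bits.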