From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1895) id EB1383858426; Wed, 10 Apr 2024 14:10:37 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org EB1383858426 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1712758237; bh=E6ADOFoCfCd5A/6WBwQJDaKK0mxkqqgWB/2VfKT+6Sc=; h=From:To:Subject:Date:From; b=cXMePLhiESAYdR5ZFkElhU4cFljid1IDyh1Xl2qU/m7Ic2trxZhkG0Vyt0D0VAIcN uPIq5YgT+AFvMi/ie5t3r8Y5AQzSyrx2CG2tffMNzDeEtyas8Oyse4OP+w7QT9CYUO qOk940aLsKPOuDxuuUfHbJL+aI6NHIKwh/TjCx58= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Wilco Dijkstra To: glibc-cvs@sourceware.org Subject: [glibc/release/2.36/master] AArch64: Improve strrchr X-Act-Checkin: glibc X-Git-Author: Wilco Dijkstra X-Git-Refname: refs/heads/release/2.36/master X-Git-Oldrev: 7cbcc959270dcf25789b2f09c4bf1c60a86b760a X-Git-Newrev: 600098c58ab53107a76237cba8b90ce26b253b56 Message-Id: <20240410141037.EB1383858426@sourceware.org> Date: Wed, 10 Apr 2024 14:10:37 +0000 (GMT) List-Id: https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=600098c58ab53107a76237cba8b90ce26b253b56 commit 600098c58ab53107a76237cba8b90ce26b253b56 Author: Wilco Dijkstra Date: Wed Jan 11 13:53:19 2023 +0000 AArch64: Improve strrchr Use shrn for narrowing the mask which simplifies code and speeds up small strings. Unroll the first search loop to improve performance on large strings. Reviewed-by: Szabolcs Nagy (cherry picked from commit 55599d480437dcf129b41b95be32b48f2a9e5da9) Diff: --- sysdeps/aarch64/strrchr.S | 58 +++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S index 596e77c43b..eda6fefb99 100644 --- a/sysdeps/aarch64/strrchr.S +++ b/sysdeps/aarch64/strrchr.S @@ -22,19 +22,16 @@ /* Assumptions: * - * ARMv8-a, AArch64 - * Neon Available. + * ARMv8-a, AArch64, Advanced SIMD. * MTE compatible. */ -/* Arguments and results. */ #define srcin x0 #define chrin w1 #define result x0 #define src x2 #define tmp x3 -#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 @@ -46,7 +43,6 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 #define vend v5 #define dend d5 @@ -58,59 +54,71 @@ the relevant byte matched the requested character; bits 2-3 are set if the relevant byte matched the NUL end of string. */ -ENTRY(strrchr) +ENTRY (strrchr) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - mov wtmp, 0x3003 - dup vrepmask.8h, wtmp - tst srcin, 15 - beq L(loop1) - - ld1 {vdata.16b}, [src], 16 + movi vrepmask.16b, 0x33 + ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp, 0xf00f - dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2) + cbnz synd, L(loop2_start) - .p2align 5 + .p2align 4 L(loop1): - ld1 {vdata.16b}, [src], 16 + ldr q1, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop1_end) + ldr q1, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - + sub src, src, 16 +L(loop1_end): + add src, src, 16 cmeq vhas_nul.16b, vdata.16b, 0 +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + rbit synd, synd +#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - bic vhas_nul.8h, 0x0f, lsl 8 - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend +#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2) - + beq L(loop2_start) L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - sub result, src, 1 + add result, src, 15 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 + nop + nop +L(loop2_start): + add src, src, 16 + bic vrepmask.8h, 0xf0 + L(loop2): cmp synd, 0 csel src_match, src, src_match, ne