From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1895) id 396093858421; Tue, 17 Jan 2023 16:36:17 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 396093858421 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1673973377; bh=uspgD+l54JRYaEW35z2hmcJ1Q9Lergcv8tqg0MzOL2Q=; h=From:To:Subject:Date:From; b=ZleXpGa2lW7KKzTZEAoUbEtjXRC1gq8SInaNF2m8Zt8YnU2XX1vIPAHU4dZIqjDYw 7FxvKWxOeTCdVCZfTRl3u1aIcKh51aQLPuN+fQcmTebuauREZotQiLKaMmaL65HjBN V4/ACOM7HTh9TXgyhbGZaXj5Z1LghJYIK6aRvmsY= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Wilco Dijkstra To: glibc-cvs@sourceware.org Subject: [glibc] AArch64: Improve strrchr X-Act-Checkin: glibc X-Git-Author: Wilco Dijkstra X-Git-Refname: refs/heads/master X-Git-Oldrev: ad098893ba3c3344a5f2f6ab1627c47204afdb47 X-Git-Newrev: 55599d480437dcf129b41b95be32b48f2a9e5da9 Message-Id: <20230117163617.396093858421@sourceware.org> Date: Tue, 17 Jan 2023 16:36:17 +0000 (GMT) List-Id: https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=55599d480437dcf129b41b95be32b48f2a9e5da9 commit 55599d480437dcf129b41b95be32b48f2a9e5da9 Author: Wilco Dijkstra Date: Wed Jan 11 13:53:19 2023 +0000 AArch64: Improve strrchr Use shrn for narrowing the mask which simplifies code and speeds up small strings. Unroll the first search loop to improve performance on large strings. Reviewed-by: Szabolcs Nagy Diff: --- sysdeps/aarch64/strrchr.S | 58 +++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S index b4b8b44aad..528034a518 100644 --- a/sysdeps/aarch64/strrchr.S +++ b/sysdeps/aarch64/strrchr.S @@ -22,19 +22,16 @@ /* Assumptions: * - * ARMv8-a, AArch64 - * Neon Available. + * ARMv8-a, AArch64, Advanced SIMD. * MTE compatible. */ -/* Arguments and results. */ #define srcin x0 #define chrin w1 #define result x0 #define src x2 #define tmp x3 -#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 @@ -46,7 +43,6 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 #define vend v5 #define dend d5 @@ -58,59 +54,71 @@ the relevant byte matched the requested character; bits 2-3 are set if the relevant byte matched the NUL end of string. */ -ENTRY(strrchr) +ENTRY (strrchr) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - mov wtmp, 0x3003 - dup vrepmask.8h, wtmp - tst srcin, 15 - beq L(loop1) - - ld1 {vdata.16b}, [src], 16 + movi vrepmask.16b, 0x33 + ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp, 0xf00f - dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2) + cbnz synd, L(loop2_start) - .p2align 5 + .p2align 4 L(loop1): - ld1 {vdata.16b}, [src], 16 + ldr q1, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop1_end) + ldr q1, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - + sub src, src, 16 +L(loop1_end): + add src, src, 16 cmeq vhas_nul.16b, vdata.16b, 0 +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + rbit synd, synd +#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - bic vhas_nul.8h, 0x0f, lsl 8 - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend +#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2) - + beq L(loop2_start) L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - sub result, src, 1 + add result, src, 15 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 + nop + nop +L(loop2_start): + add src, src, 16 + bic vrepmask.8h, 0xf0 + L(loop2): cmp synd, 0 csel src_match, src, src_match, ne