public inbox for glibc-cvs@sourceware.org
help / color / mirror / Atom feed
* [glibc/release/2.33/master] AArch64: Optimize strnlen
@ 2024-04-10 16:37 Wilco Dijkstra
0 siblings, 0 replies; only message in thread
From: Wilco Dijkstra @ 2024-04-10 16:37 UTC (permalink / raw)
To: glibc-cvs
https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=3527d31d0bbafcc8b5b18eb9ebc5f290adeec495
commit 3527d31d0bbafcc8b5b18eb9ebc5f290adeec495
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed Jan 11 13:53:05 2023 +0000
AArch64: Optimize strnlen
Optimize strnlen using the shrn instruction and improve the main loop.
Small strings are around 10% faster, large strings are 40% faster on
modern CPUs.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
(cherry picked from commit ad098893ba3c3344a5f2f6ab1627c47204afdb47)
Diff:
---
sysdeps/aarch64/strnlen.S | 39 ++++++++++++++++++---------------------
1 file changed, 18 insertions(+), 21 deletions(-)
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 613d521b62..5cd5ef50ec 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -44,19 +44,16 @@
/*
Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
- per byte. We take 4 bits of every comparison byte with shift right and narrow
- by 4 instruction. Since the bits in the nibble mask reflect the order in
- which things occur in the original string, counting trailing zeros identifies
- exactly which byte matched. */
+ Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+ four bits per byte using the shrn instruction. A count trailing zeros then
+ identifies the first zero byte. */
ENTRY (__strnlen)
PTR_ARG (0)
SIZE_ARG (1)
bic src, srcin, 15
cbz cntin, L(nomatch)
- ld1 {vdata.16b}, [src], 16
+ ld1 {vdata.16b}, [src]
cmeq vhas_chr.16b, vdata.16b, 0
lsl shift, srcin, 2
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
@@ -71,36 +68,40 @@ L(finish):
csel result, cntin, result, ls
ret
+L(nomatch):
+ mov result, cntin
+ ret
+
L(start_loop):
sub tmp, src, srcin
+ add tmp, tmp, 17
subs cntrem, cntin, tmp
- b.ls L(nomatch)
+ b.lo L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
-
+ tbz cntrem, 4, L(loop32_2)
+ sub src, src, 16
.p2align 5
L(loop32):
- ldr qdata, [src], 16
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, 0
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src], 16
+ ldr qdata, [src, 16]
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, 0
- b.ls L(end)
+ b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
-
+L(end_2):
+ add src, src, 16
L(end):
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
- sub src, src, 16
- mov synd, vend.d[0]
sub result, src, srcin
+ fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
@@ -110,10 +111,6 @@ L(end):
csel result, cntin, result, ls
ret
-L(nomatch):
- mov result, cntin
- ret
-
END (__strnlen)
libc_hidden_def (__strnlen)
weak_alias (__strnlen, strnlen)
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2024-04-10 16:37 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-10 16:37 [glibc/release/2.33/master] AArch64: Optimize strnlen Wilco Dijkstra
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).