public inbox for glibc-cvs@sourceware.org
help / color / mirror / Atom feed
* [glibc/fw/builtin-syscalls-4] aarch64: Optimized strlen for strlen_asimd
@ 2019-12-31 10:47 Florian Weimer
0 siblings, 0 replies; only message in thread
From: Florian Weimer @ 2019-12-31 10:47 UTC (permalink / raw)
To: glibc-cvs
https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c2150769d015dca1805334af7743829f1e4c0b6a
commit c2150769d015dca1805334af7743829f1e4c0b6a
Author: Xuelei Zhang <zhangxuelei4@huawei.com>
Date: Thu Dec 19 13:41:40 2019 +0000
aarch64: Optimized strlen for strlen_asimd
Optimize the strlen implementation by using vector operations and
loop unrolling in the main loop. Compared to __strlen_generic, it reduces
latency of cases in bench-strlen by 7%~18% when the length of src
is greater than 128 bytes, with gains throughout the benchmark.
Checked on aarch64-linux-gnu.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Diff:
---
sysdeps/aarch64/multiarch/strlen.c | 4 ++-
sysdeps/aarch64/multiarch/strlen_asimd.S | 42 ++++++++++++++++++++------------
2 files changed, 29 insertions(+), 17 deletions(-)
diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 1db01ba..abf6513 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -34,7 +34,9 @@ extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
libc_ifunc (__strlen,
- (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+ (USE_ASIMD_STRLEN () || IS_KUNPENG(midr)
+ ? __strlen_asimd
+ :__strlen_generic));
# undef strlen
strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 1d1c6ab..1de6cd3 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -48,6 +48,9 @@
#define dataq2 q3
#define datav2 v3
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 16
#else
@@ -82,40 +85,47 @@ ENTRY_ALIGN (__strlen_asimd, 6)
DELOUSE (0)
DELOUSE (1)
and tmp1, srcin, MIN_PAGE_SIZE - 1
+ mov zeroones, REP8_01
cmp tmp1, MIN_PAGE_SIZE - 16
b.gt L(page_cross)
- ldr dataq, [srcin]
+ ldp data1, data2, [srcin]
#ifdef __AARCH64EB__
- rev64 datav.16b, datav.16b
+ rev data1, data1
+ rev data2, data2
#endif
- /* Get the minimum value and keep going if it is not zero. */
- uminv datab2, datav.16b
- mov tmp1, datav2.d[0]
- cbnz tmp1, L(main_loop_entry)
-
- cmeq datav.16b, datav.16b, #0
- mov data1, datav.d[0]
- mov data2, datav.d[1]
- cmp data1, 0
- csel data1, data1, data2, ne
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(main_loop_entry)
+ csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
- rev data1, data1
- clz tmp1, data1
- csel len, xzr, len, ne
+ rev has_nul1, has_nul1
+ clz tmp1, has_nul1
+ csel len, xzr, len, cc
add len, len, tmp1, lsr 3
ret
L(main_loop_entry):
bic src, srcin, 15
+ sub src, src, 16
L(main_loop):
- ldr dataq, [src, 16]!
+ ldr dataq, [src, 32]!
L(page_cross_entry):
/* Get the minimum value and keep going if it is not zero. */
uminv datab2, datav.16b
mov tmp1, datav2.d[0]
+ cbz tmp1, L(tail)
+ ldr dataq, [src, 16]
+ uminv datab2, datav.16b
+ mov tmp1, datav2.d[0]
cbnz tmp1, L(main_loop)
+ add src, src, 16
L(tail):
#ifdef __AARCH64EB__
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2019-12-31 10:47 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-31 10:47 [glibc/fw/builtin-syscalls-4] aarch64: Optimized strlen for strlen_asimd Florian Weimer
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).