public inbox for glibc-cvs@sourceware.org
help / color / mirror / Atom feed
* [glibc/fw/builtin-syscalls-4] aarch64: Optimized strlen for strlen_asimd
@ 2019-12-31 10:47 Florian Weimer
  0 siblings, 0 replies; only message in thread
From: Florian Weimer @ 2019-12-31 10:47 UTC (permalink / raw)
  To: glibc-cvs

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c2150769d015dca1805334af7743829f1e4c0b6a

commit c2150769d015dca1805334af7743829f1e4c0b6a
Author: Xuelei Zhang <zhangxuelei4@huawei.com>
Date:   Thu Dec 19 13:41:40 2019 +0000

    aarch64: Optimized strlen for strlen_asimd
    
    Optimize the strlen implementation by using vector operations and
    loop unrolling in main loop.Compared to __strlen_generic,it reduces
    latency of cases in bench-strlen by 7%~18% when the length of src
    is greater than 128 bytes, with gains throughout the benchmark.
    
    Checked on aarch64-linux-gnu.
    
    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

Diff:
---
 sysdeps/aarch64/multiarch/strlen.c       |  4 ++-
 sysdeps/aarch64/multiarch/strlen_asimd.S | 42 ++++++++++++++++++++------------
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 1db01ba..abf6513 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -34,7 +34,9 @@ extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
 
 libc_ifunc (__strlen,
-	    (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+	    (USE_ASIMD_STRLEN () || IS_KUNPENG(midr)
+	    ? __strlen_asimd
+	    :__strlen_generic));
 
 # undef strlen
 strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 1d1c6ab..1de6cd3 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -48,6 +48,9 @@
 #define dataq2		q3
 #define datav2		v3
 
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
 #ifdef TEST_PAGE_CROSS
 # define MIN_PAGE_SIZE 16
 #else
@@ -82,40 +85,47 @@ ENTRY_ALIGN (__strlen_asimd, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
 	cmp	tmp1, MIN_PAGE_SIZE - 16
 	b.gt	L(page_cross)
-	ldr	dataq, [srcin]
+	ldp	data1, data2, [srcin]
 #ifdef __AARCH64EB__
-	rev64	datav.16b, datav.16b
+	rev	data1, data1
+	rev	data2, data2
 #endif
 
-	/* Get the minimum value and keep going if it is not zero.  */
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	cbnz	tmp1, L(main_loop_entry)
-
-	cmeq	datav.16b, datav.16b, #0
-	mov	data1, datav.d[0]
-	mov	data2, datav.d[1]
-	cmp	data1, 0
-	csel	data1, data1, data2, ne
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
-	rev	data1, data1
-	clz	tmp1, data1
-	csel	len, xzr, len, ne
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
 L(main_loop_entry):
 	bic	src, srcin, 15
+	sub	src, src, 16
 
 L(main_loop):
-	ldr	dataq, [src, 16]!
+	ldr	dataq, [src, 32]!
 L(page_cross_entry):
 	/* Get the minimum value and keep going if it is not zero.  */
 	uminv	datab2, datav.16b
 	mov	tmp1, datav2.d[0]
+	cbz	tmp1, L(tail)
+	ldr	dataq, [src, 16]
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
 	cbnz	tmp1, L(main_loop)
+	add	src, src, 16
 
 L(tail):
 #ifdef __AARCH64EB__


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2019-12-31 10:47 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-31 10:47 [glibc/fw/builtin-syscalls-4] aarch64: Optimized strlen for strlen_asimd Florian Weimer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).