public inbox for newlib-cvs@sourceware.org
help / color / mirror / Atom feed
* [newlib-cygwin] Improve strncmp for mutually misaligned inputs
@ 2018-07-13 11:28 Corinna Vinschen
0 siblings, 0 replies; only message in thread
From: Corinna Vinschen @ 2018-07-13 11:28 UTC (permalink / raw)
To: newlib-cvs
https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=f44eee8f1bb88c235cea564106b9eab86e5ea29b
commit f44eee8f1bb88c235cea564106b9eab86e5ea29b
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri Jun 29 18:08:20 2018 +0530
Improve strncmp for mutually misaligned inputs
The mutually misaligned inputs on aarch64 are compared with a simple
byte copy, which is not very efficient. Enhance the comparison
similar to strcmp by loading a double-word at a time. The peak
performance improvement (i.e. 4k maxlen comparisons) due to this on
the strncmp microbenchmark in glibc is as follows:
falkor: 3.5x (up to 72% time reduction)
cortex-a73: 3.5x (up to 71% time reduction)
cortex-a53: 3.5x (up to 71% time reduction)
All mutually misaligned inputs from 16 bytes maxlen onwards show
upwards of 15% improvement and there is no measurable effect on the
performance of aligned/mutually aligned inputs.
Diff:
---
newlib/libc/machine/aarch64/strncmp.S | 96 ++++++++++++++++++++++++++++-------
1 file changed, 79 insertions(+), 17 deletions(-)
diff --git a/newlib/libc/machine/aarch64/strncmp.S b/newlib/libc/machine/aarch64/strncmp.S
index 0b90dd8..ffdabc2 100644
--- a/newlib/libc/machine/aarch64/strncmp.S
+++ b/newlib/libc/machine/aarch64/strncmp.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013, Linaro Limited
+/* Copyright (c) 2013, 2018, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -67,6 +67,7 @@
#define limit_wd x13
#define mask x14
#define endloop x15
+#define count mask
.text
.p2align 6
@@ -78,9 +79,9 @@ def_fn strncmp
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
+ and count, src1, #7
b.ne .Lmisaligned8
- ands tmp1, src1, #7
- b.ne .Lmutual_align
+ cbnz count, .Lmutual_align
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
@@ -185,44 +186,105 @@ def_fn strncmp
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
- neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
#endif
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, tmp1
- add tmp3, tmp3, tmp1
+ add limit, limit, count
+ add tmp3, tmp3, count
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
b .Lstart_realigned
-.Lret0:
- mov result, #0
- ret
-
.p2align 6
+ /* Don't bother with dwords for up to 16 bytes. */
.Lmisaligned8:
- sub limit, limit, #1
-1:
+ cmp limit, #16
+ b.hs .Ltry_misaligned_words
+
+.Lbyte_loop:
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
- ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq 1b
+ b.eq .Lbyte_loop
+.Ldone:
sub result, data1, data2
ret
- .size strncmp, . - strncmp
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+.Ltry_misaligned_words:
+ lsr limit_wd, limit, #3
+ cbz count, .Ldo_misaligned
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+ lsr limit_wd, limit, #3
+.Lpage_end_loop:
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne .Ldone
+ subs count, count, #1
+ b.hi .Lpage_end_loop
+
+.Ldo_misaligned:
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+ loop, we fetch 1 less dword because we risk crossing bounds on
+ SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo .Ldone_loop
+.Lloop_misaligned:
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, .Lpage_end_loop
+
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne .Lnot_limit
+ subs limit_wd, limit_wd, #1
+ b.pl .Lloop_misaligned
+
+.Ldone_loop:
+ /* We found a difference or a NULL before the limit was reached. */
+ and limit, limit, #7
+ cbz limit, .Lnot_limit
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne .Lnot_limit
+
+.Lret0:
+ mov result, #0
+ ret
+ .size strncmp, . - strncmp
#endif
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2018-07-13 11:28 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-13 11:28 [newlib-cygwin] Improve strncmp for mutually misaligned inputs Corinna Vinschen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).