* [PATCH 0/3] Improved string comparison routines for aarch64
From: Siddhesh Poyarekar @ 2018-06-29 12:38 UTC
To: newlib
Hi,
The following patches improve the performance of comparing mutually
misaligned strings on aarch64 by up to 3.5x, and of memcmp by up to 2x.
Siddhesh
Siddhesh Poyarekar (3):
[aarch64] Improve strncmp for mutually misaligned inputs
[aarch64] memcmp.S: optimize for medium to large sizes
[aarch64] strcmp.S: Improve performance for misaligned strings
newlib/libc/machine/aarch64/memcmp.S | 142 ++++++++++++++++++--------
newlib/libc/machine/aarch64/strcmp.S | 51 +++++++--
newlib/libc/machine/aarch64/strncmp.S | 96 ++++++++++++++---
3 files changed, 218 insertions(+), 71 deletions(-)
--
2.17.1
* [PATCH 1/3] [aarch64] Improve strncmp for mutually misaligned inputs
From: Siddhesh Poyarekar @ 2018-06-29 12:38 UTC
To: newlib
Mutually misaligned inputs on aarch64 are currently compared one byte
at a time, which is not very efficient. Enhance the comparison, as
strcmp does, by loading a double-word at a time. The peak performance
improvement (i.e. for 4k maxlen comparisons) due to this on the strncmp
microbenchmark in glibc is as follows:
falkor: 3.5x (up to 72% time reduction)
cortex-a73: 3.5x (up to 71% time reduction)
cortex-a53: 3.5x (up to 71% time reduction)
All mutually misaligned inputs with a maxlen of 16 bytes or more show
upwards of 15% improvement, and there is no measurable effect on the
performance of aligned/mutually aligned inputs.
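As background, the dword loop relies on the same word-at-a-time NUL
test that strcmp already uses (the REP8_01/REP8_7f constants in the
code below). A minimal C sketch of that predicate, for illustration
only; the function name is mine, not part of the patch:

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff at least one byte of x is zero: (X - 1) & ~X & 0x80
   applied to all eight bytes in parallel, matching the sub/orr/bics
   sequence in the assembly below.  */
static inline uint64_t
has_nul_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

A borrow in the subtraction can only originate at a zero byte, so as a
zero/non-zero test the expression is exact.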
---
newlib/libc/machine/aarch64/strncmp.S | 96 ++++++++++++++++++++++-----
1 file changed, 79 insertions(+), 17 deletions(-)
diff --git a/newlib/libc/machine/aarch64/strncmp.S b/newlib/libc/machine/aarch64/strncmp.S
index 0b90dd8a6..ffdabc260 100644
--- a/newlib/libc/machine/aarch64/strncmp.S
+++ b/newlib/libc/machine/aarch64/strncmp.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013, Linaro Limited
+/* Copyright (c) 2013, 2018, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -67,6 +67,7 @@
#define limit_wd x13
#define mask x14
#define endloop x15
+#define count mask
.text
.p2align 6
@@ -78,9 +79,9 @@ def_fn strncmp
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
+ and count, src1, #7
b.ne .Lmisaligned8
- ands tmp1, src1, #7
- b.ne .Lmutual_align
+ cbnz count, .Lmutual_align
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
@@ -185,44 +186,105 @@ def_fn strncmp
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
- neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
#endif
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, tmp1
- add tmp3, tmp3, tmp1
+ add limit, limit, count
+ add tmp3, tmp3, count
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
b .Lstart_realigned
-.Lret0:
- mov result, #0
- ret
-
.p2align 6
+ /* Don't bother with dwords for up to 16 bytes. */
.Lmisaligned8:
- sub limit, limit, #1
-1:
+ cmp limit, #16
+ b.hs .Ltry_misaligned_words
+
+.Lbyte_loop:
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
- ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq 1b
+ b.eq .Lbyte_loop
+.Ldone:
sub result, data1, data2
ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+.Ltry_misaligned_words:
+ lsr limit_wd, limit, #3
+ cbz count, .Ldo_misaligned
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+ lsr limit_wd, limit, #3
+
+.Lpage_end_loop:
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne .Ldone
+ subs count, count, #1
+ b.hi .Lpage_end_loop
+
+.Ldo_misaligned:
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+ loop, we fetch 1 less dword because we risk crossing bounds on
+ SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo .Ldone_loop
+.Lloop_misaligned:
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, .Lpage_end_loop
+
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne .Lnot_limit
+ subs limit_wd, limit_wd, #1
+ b.pl .Lloop_misaligned
+
+.Ldone_loop:
+ /* We found a difference or a NULL before the limit was reached. */
+ and limit, limit, #7
+ cbz limit, .Lnot_limit
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne .Lnot_limit
+
+.Lret0:
+ mov result, #0
+ ret
.size strncmp, . - strncmp
-
#endif
--
2.17.1
* [PATCH 2/3] [aarch64] memcmp.S: optimize for medium to large sizes
From: Siddhesh Poyarekar @ 2018-06-29 12:38 UTC
To: newlib
This improved memcmp provides a fast path for compares of up to 16
bytes and then compares 16 bytes at a time, thus optimizing loads from
both sources. On the glibc memcmp microbenchmark, smaller compare sizes
retain their performance (within a measurement error of ~1ns), while
compares of up to 4K see execution time reduced by up to 31% on the APM
Mustang. On Qualcomm Falkor the reduction reaches almost 48%, i.e.
almost a 2x improvement, for sizes of 2K and above.
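The last 1-16 bytes are handled with overlapping unaligned loads rather
than a byte loop: two 8-byte loads per source are taken at the end of
the buffer and may re-read bytes that were already compared. A hedged C
sketch of the idea (assumes GCC/Clang builtins and a little-endian
host, mirroring the rev instructions in the assembly; the function is
illustrative, not the committed code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Compare the final 16 bytes of two n-byte buffers (n >= 16) with two
   possibly overlapping 8-byte loads per source, as in L(last_bytes).  */
static int
tail_compare (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  uint64_t a1, a2, b1, b2;

  memcpy (&a1, s1 + n - 16, 8);  /* May re-read bytes compared earlier.  */
  memcpy (&a2, s2 + n - 16, 8);
  memcpy (&b1, s1 + n - 8, 8);
  memcpy (&b2, s2 + n - 8, 8);

  /* memcmp order is that of the first differing byte, i.e. big-endian
     word order, hence the byte swaps on a little-endian host.  */
  if (a1 != a2)
    return __builtin_bswap64 (a1) < __builtin_bswap64 (a2) ? -1 : 1;
  if (b1 != b2)
    return __builtin_bswap64 (b1) < __builtin_bswap64 (b2) ? -1 : 1;
  return 0;
}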
---
newlib/libc/machine/aarch64/memcmp.S | 142 +++++++++++++++++++--------
1 file changed, 99 insertions(+), 43 deletions(-)
diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S
index 1ffb79eb3..605d99365 100644
--- a/newlib/libc/machine/aarch64/memcmp.S
+++ b/newlib/libc/machine/aarch64/memcmp.S
@@ -1,3 +1,31 @@
+/* memcmp - compare memory
+
+ Copyright (c) 2018 Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
/*
* Copyright (c) 2017 ARM Ltd
* All rights reserved.
@@ -35,6 +63,8 @@
* ARMv8-a, AArch64, unaligned accesses.
*/
+#define L(l) .L ## l
+
/* Parameters and result. */
#define src1 x0
#define src2 x1
@@ -44,9 +74,12 @@
/* Internal variables. */
#define data1 x3
#define data1w w3
-#define data2 x4
-#define data2w w4
-#define tmp1 x5
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
.macro def_fn f p2align=0
.text
@@ -56,83 +89,106 @@
\f:
.endm
-/* Small inputs of less than 8 bytes are handled separately. This allows the
- main code to be sped up using unaligned loads since there are now at least
- 8 bytes to be compared. If the first 8 bytes are equal, align src1.
- This ensures each iteration does at most one unaligned access even if both
- src1 and src2 are unaligned, and mutually aligned inputs behave as if
- aligned. After the main loop, process the last 8 bytes using unaligned
- accesses. */
-
def_fn memcmp p2align=6
subs limit, limit, 8
- b.lo .Lless8
+ b.lo L(less8)
- /* Limit >= 8, so check first 8 bytes using unaligned loads. */
ldr data1, [src1], 8
ldr data2, [src2], 8
- and tmp1, src1, 7
- add limit, limit, tmp1
cmp data1, data2
- bne .Lreturn
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+ strings. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop16)
/* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
sub src1, src1, tmp1
sub src2, src2, tmp1
- subs limit, limit, 8
- b.ls .Llast_bytes
-
- /* Loop performing 8 bytes per iteration using aligned src1.
- Limit is pre-decremented by 8 and must be larger than zero.
- Exit if <= 8 bytes left to do or if the data is not equal. */
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
.p2align 4
-.Lloop8:
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- subs limit, limit, 8
- ccmp data1, data2, 0, hi /* NZCV = 0b0000. */
- b.eq .Lloop8
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
cmp data1, data2
- bne .Lreturn
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
- /* Compare last 1-8 bytes using unaligned access. */
-.Llast_bytes:
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
+ /* Compare last 1-16 bytes using unaligned access. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
/* Compare data bytes and set return value to 0, -1 or 1. */
-.Lreturn:
+L(return):
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
cmp data1, data2
-.Lret_eq:
+L(ret_eq):
cset result, ne
cneg result, result, lo
- ret
+ ret
.p2align 4
/* Compare up to 8 bytes. Limit is [-8..-1]. */
-.Lless8:
+L(less8):
adds limit, limit, 4
- b.lo .Lless4
+ b.lo L(less4)
ldr data1w, [src1], 4
ldr data2w, [src2], 4
cmp data1w, data2w
- b.ne .Lreturn
+ b.ne L(return)
sub limit, limit, 4
-.Lless4:
+L(less4):
adds limit, limit, 4
- beq .Lret_eq
-.Lbyte_loop:
+ beq L(ret_eq)
+L(byte_loop):
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
subs limit, limit, 1
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.eq .Lbyte_loop
+ b.eq L(byte_loop)
sub result, data1w, data2w
ret
--
2.17.1
* [PATCH 3/3] [aarch64] strcmp.S: Improve performance for misaligned strings
From: Siddhesh Poyarekar @ 2018-06-30 3:19 UTC
To: newlib
Replace the simple byte-wise compare in the misaligned case with a
dword compare, with page boundary checks in place. For simplicity I've
chosen a 4K page boundary so that we don't have to query the actual
page size on the system. This results in up to 3x better performance in
the unaligned case on falkor and about 2.5x on mustang, as measured
using bench-strcmp in glibc.
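The dword loop only needs to guard SRC2, since SRC1 is aligned by the
setup code; before each 8-byte load it checks whether SRC2 sits in the
last dword of a 4K page and, if so, drops back to the byte loop. A C
sketch of that check (the function name is mine; the masks are the ones
used in the patch):

#include <stdint.h>

/* True when p lies in the final 8-byte slot of a 4 KiB page, i.e. a
   dword load near p could cross into the next page.  Mirrors the
   "and tmp1, src2, #0xff8; eor tmp1, tmp1, #0xff8" test and is
   deliberately conservative at the exact boundary.  */
static inline int
near_page_end (const char *p)
{
  return ((uintptr_t) p & 0xff8) == 0xff8;
}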
---
newlib/libc/machine/aarch64/strcmp.S | 51 ++++++++++++++++++++++------
1 file changed, 40 insertions(+), 11 deletions(-)
diff --git a/newlib/libc/machine/aarch64/strcmp.S b/newlib/libc/machine/aarch64/strcmp.S
index 85baca920..e2bef2d49 100644
--- a/newlib/libc/machine/aarch64/strcmp.S
+++ b/newlib/libc/machine/aarch64/strcmp.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012-2013, Linaro Limited
+/* Copyright (c) 2012-2018, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -41,6 +41,8 @@
\f:
.endm
+#define L(label) .L ## label
+
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
@@ -69,24 +71,25 @@ def_fn strcmp p2align=6
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
- b.ne .Lmisaligned8
+ b.ne L(misaligned8)
ands tmp1, src1, #7
- b.ne .Lmutual_align
+ b.ne L(mutual_align)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
-.Lloop_aligned:
+L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
orr syndrome, diff, has_nul
- cbz syndrome, .Lloop_aligned
+ cbz syndrome, L(loop_aligned)
/* End of performance-critical section -- one 64B cache line. */
+L(end):
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
@@ -137,7 +140,7 @@ def_fn strcmp p2align=6
ret
#endif
-.Lmutual_align:
+L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that preceed the start point. */
@@ -157,15 +160,41 @@ def_fn strcmp p2align=6
#endif
orr data1, data1, tmp2
orr data2, data2, tmp2
- b .Lstart_realigned
+ b L(start_realigned)
-.Lmisaligned8:
- /* We can do better than this. */
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
+L(do_misaligned):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Lmisaligned8
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(do_misaligned)
+
+L(loop_misaligned):
+ /* Test if we are within the last dword of the end of a 4K page. If
+ yes then jump back to the misaligned loop to copy a byte at a time. */
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_misaligned)
+ b L(end)
+
+L(done):
sub result, data1, data2
ret
.size strcmp, .-strcmp
--
2.17.1
* Re: [PATCH 0/3] Improved string comparison routines for aarch64
From: Corinna Vinschen @ 2018-07-02 11:36 UTC
To: newlib
On Jun 29 18:08, Siddhesh Poyarekar wrote:
> Hi,
>
> Following patches improve performance of comparing mutually misaligned
> strings by up to 3.5x on aarch64 and up to 2x for memcmp.
>
> Siddhesh
>
> Siddhesh Poyarekar (3):
> [aarch64] Improve strncmp for mutually misaligned inputs
> [aarch64] memcmp.S: optimize for medium to large sizes
> [aarch64] strcmp.S: Improve performance for misaligned strings
>
> newlib/libc/machine/aarch64/memcmp.S | 142 ++++++++++++++++++--------
> newlib/libc/machine/aarch64/strcmp.S | 51 +++++++--
> newlib/libc/machine/aarch64/strncmp.S | 96 ++++++++++++++---
> 3 files changed, 218 insertions(+), 71 deletions(-)
>
> --
> 2.17.1
A review from the ARM guys here, please?
Thanks,
Corinna
--
Corinna Vinschen
Cygwin Maintainer
Red Hat
* Re: [PATCH 0/3] Improved string comparison routines for aarch64
From: Corinna Vinschen @ 2018-07-13 11:35 UTC
To: Siddhesh Poyarekar; +Cc: newlib
On Jun 29 18:08, Siddhesh Poyarekar wrote:
> Hi,
>
> Following patches improve performance of comparing mutually misaligned
> strings by up to 3.5x on aarch64 and up to 2x for memcmp.
>
> Siddhesh
>
> Siddhesh Poyarekar (3):
> [aarch64] Improve strncmp for mutually misaligned inputs
> [aarch64] memcmp.S: optimize for medium to large sizes
> [aarch64] strcmp.S: Improve performance for misaligned strings
>
> newlib/libc/machine/aarch64/memcmp.S | 142 ++++++++++++++++++--------
> newlib/libc/machine/aarch64/strcmp.S | 51 +++++++--
> newlib/libc/machine/aarch64/strncmp.S | 96 ++++++++++++++---
> 3 files changed, 218 insertions(+), 71 deletions(-)
>
> --
> 2.17.1
Pushed.
Thanks,
Corinna
--
Corinna Vinschen
Cygwin Maintainer
Red Hat
* Re: [PATCH 0/3] Improved string comparison routines for aarch64
From: Siddhesh Poyarekar @ 2018-07-13 14:50 UTC
To: newlib
On 07/13/2018 04:58 PM, Corinna Vinschen wrote:
> On Jun 29 18:08, Siddhesh Poyarekar wrote:
>> Hi,
>>
>> Following patches improve performance of comparing mutually misaligned
>> strings by up to 3.5x on aarch64 and up to 2x for memcmp.
>>
>> Siddhesh
>>
>> Siddhesh Poyarekar (3):
>> [aarch64] Improve strncmp for mutually misaligned inputs
>> [aarch64] memcmp.S: optimize for medium to large sizes
>> [aarch64] strcmp.S: Improve performance for misaligned strings
>>
>> newlib/libc/machine/aarch64/memcmp.S | 142 ++++++++++++++++++--------
>> newlib/libc/machine/aarch64/strcmp.S | 51 +++++++--
>> newlib/libc/machine/aarch64/strncmp.S | 96 ++++++++++++++---
>> 3 files changed, 218 insertions(+), 71 deletions(-)
>>
>> --
>> 2.17.1
>
> Pushed.
Thank you!
Siddhesh