public inbox for newlib@sourceware.org
* [PATCH 0/3] Improved string comparison routines for aarch64
@ 2018-06-29 12:38 Siddhesh Poyarekar
  2018-06-29 12:38 ` [PATCH 2/3] [aarch64] memcmp.S: optimize for medium to large sizes Siddhesh Poyarekar
                   ` (4 more replies)
  0 siblings, 5 replies; 7+ messages in thread
From: Siddhesh Poyarekar @ 2018-06-29 12:38 UTC (permalink / raw)
  To: newlib

Hi,

The following patches improve the performance of comparing mutually
misaligned strings on aarch64 by up to 3.5x, and memcmp by up to 2x.

Siddhesh

Siddhesh Poyarekar (3):
  [aarch64] Improve strncmp for mutually misaligned inputs
  [aarch64] memcmp.S: optimize for medium to large sizes
  [aarch64] strcmp.S: Improve performance for misaligned strings

 newlib/libc/machine/aarch64/memcmp.S  | 142 ++++++++++++++++++--------
 newlib/libc/machine/aarch64/strcmp.S  |  51 +++++++--
 newlib/libc/machine/aarch64/strncmp.S |  96 ++++++++++++++---
 3 files changed, 218 insertions(+), 71 deletions(-)

-- 
2.17.1


* [PATCH 1/3] [aarch64] Improve strncmp for mutually misaligned inputs
  2018-06-29 12:38 [PATCH 0/3] Improved string comparison routines for aarch64 Siddhesh Poyarekar
  2018-06-29 12:38 ` [PATCH 2/3] [aarch64] memcmp.S: optimize for medium to large sizes Siddhesh Poyarekar
@ 2018-06-29 12:38 ` Siddhesh Poyarekar
  2018-06-30  3:19 ` [PATCH 3/3] [aarch64] strcmp.S: Improve performance for misaligned strings Siddhesh Poyarekar
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Siddhesh Poyarekar @ 2018-06-29 12:38 UTC (permalink / raw)
  To: newlib

On aarch64, mutually misaligned inputs are currently compared one byte
at a time, which is not very efficient.  Enhance the comparison, as in
strcmp, by loading a double-word at a time.  The peak performance
improvement (i.e., at a maxlen of 4k) on the glibc strncmp
microbenchmark is as follows:

falkor: 3.5x (up to 72% time reduction)
cortex-a73: 3.5x (up to 71% time reduction)
cortex-a53: 3.5x (up to 71% time reduction)

All mutually misaligned inputs from a maxlen of 16 bytes onwards show
upwards of 15% improvement, and there is no measurable effect on the
performance of aligned or mutually aligned inputs.
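
To illustrate the approach, here is a rough C sketch (not the newlib
code): the names has_nul_byte and strncmp_sketch are made up for the
example, a 4K page is assumed as in the assembly, and like the assembly
it may read up to 7 bytes past the NUL terminator (never across an
unchecked page boundary), so it is a sketch rather than portable C.

#include <stdint.h>
#include <string.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff the 64-bit word contains a zero byte; this is the
   sub/orr/bics sequence from the assembly written out in C.  */
static inline uint64_t has_nul_byte(uint64_t x)
{
    return (x - REP8_01) & ~(x | REP8_7f);
}

int strncmp_sketch(const char *s1, const char *s2, size_t n)
{
    /* Byte compare until s1 is 8-byte aligned, or until a difference,
       a NUL or the limit is hit.  */
    while (n && ((uintptr_t)s1 & 7)) {
        unsigned char c1 = *s1++, c2 = *s2++;
        n--;
        if (c1 != c2 || c1 == 0)
            return c1 - c2;
    }

    /* Dword loop: s1 is aligned, s2 may not be.  */
    while (n >= 8) {
        if (((uintptr_t)s2 & 0xff8) == 0xff8) {
            /* s2 is in the last 8 bytes of a 4 KiB page, so an 8-byte
               load might cross it; step over the boundary bytewise.  */
            for (int i = 0; i < 8; i++, n--) {
                unsigned char c1 = *s1++, c2 = *s2++;
                if (c1 != c2 || c1 == 0)
                    return c1 - c2;
            }
            continue;
        }
        uint64_t d1, d2;
        memcpy(&d1, s1, 8);        /* unaligned-safe 8-byte loads */
        memcpy(&d2, s2, 8);
        if (d1 != d2 || has_nul_byte(d1))
            break;                 /* settle the result bytewise below */
        s1 += 8;
        s2 += 8;
        n -= 8;
    }

    /* Tail and mismatch resolution, one byte at a time.  */
    while (n--) {
        unsigned char c1 = *s1++, c2 = *s2++;
        if (c1 != c2 || c1 == 0)
            return c1 - c2;
    }
    return 0;
}
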
---
 newlib/libc/machine/aarch64/strncmp.S | 96 ++++++++++++++++++++++-----
 1 file changed, 79 insertions(+), 17 deletions(-)

diff --git a/newlib/libc/machine/aarch64/strncmp.S b/newlib/libc/machine/aarch64/strncmp.S
index 0b90dd8a6..ffdabc260 100644
--- a/newlib/libc/machine/aarch64/strncmp.S
+++ b/newlib/libc/machine/aarch64/strncmp.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013, Linaro Limited
+/* Copyright (c) 2013, 2018, Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -67,6 +67,7 @@
 #define limit_wd	x13
 #define mask		x14
 #define endloop		x15
+#define count		mask
 
 	.text
 	.p2align 6
@@ -78,9 +79,9 @@ def_fn strncmp
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
+	and	count, src1, #7
 	b.ne	.Lmisaligned8
-	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
+	cbnz	count, .Lmutual_align
 	/* Calculate the number of full and partial words -1.  */
 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
 	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
@@ -185,44 +186,105 @@ def_fn strncmp
 	bic	src1, src1, #7
 	bic	src2, src2, #7
 	ldr	data1, [src1], #8
-	neg	tmp3, tmp1, lsl #3	/* 64 - bits(bytes beyond align). */
+	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
 #ifdef __AARCH64EB__
 	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
+	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
 #else
 	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
+	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
 #endif
 	and	tmp3, limit_wd, #7
 	lsr	limit_wd, limit_wd, #3
 	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, tmp1
-	add	tmp3, tmp3, tmp1
+	add	limit, limit, count
+	add	tmp3, tmp3, count
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
 	add	limit_wd, limit_wd, tmp3, lsr #3
 	b	.Lstart_realigned
 
-.Lret0:
-	mov	result, #0
-	ret
-
 	.p2align 6
+	/* Don't bother with dwords for up to 16 bytes.  */
 .Lmisaligned8:
-	sub	limit, limit, #1
-1:
+	cmp	limit, #16
+	b.hs	.Ltry_misaligned_words
+
+.Lbyte_loop:
 	/* Perhaps we can do better than this.  */
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
 	subs	limit, limit, #1
-	ccmp	data1w, #1, #0, cs	/* NZCV = 0b0000.  */
+	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
+	b.eq	.Lbyte_loop
+.Ldone:
 	sub	result, data1, data2
 	ret
+	/* Align the SRC1 to a dword by doing a bytewise compare and then do
+	   the dword loop.  */
+.Ltry_misaligned_words:
+	lsr	limit_wd, limit, #3
+	cbz	count, .Ldo_misaligned
+
+	neg	count, count
+	and	count, count, #7
+	sub	limit, limit, count
+	lsr	limit_wd, limit, #3
+
+.Lpage_end_loop:
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	.Ldone
+	subs	count, count, #1
+	b.hi	.Lpage_end_loop
+
+.Ldo_misaligned:
+	/* Prepare ourselves for the next page crossing.  Unlike the aligned
+	   loop, we fetch 1 less dword because we risk crossing bounds on
+	   SRC2.  */
+	mov	count, #8
+	subs	limit_wd, limit_wd, #1
+	b.lo	.Ldone_loop
+.Lloop_misaligned:
+	and	tmp2, src2, #0xff8
+	eor	tmp2, tmp2, #0xff8
+	cbz	tmp2, .Lpage_end_loop
+
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	.Lnot_limit
+	subs	limit_wd, limit_wd, #1
+	b.pl	.Lloop_misaligned
+
+.Ldone_loop:
+	/* We found a difference or a NULL before the limit was reached.  */
+	and	limit, limit, #7
+	cbz	limit, .Lnot_limit
+	/* Read the last word.  */
+	sub	src1, src1, 8
+	sub	src2, src2, 8
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	.Lnot_limit
+
+.Lret0:
+	mov	result, #0
+	ret
 	.size strncmp, . - strncmp
-
 #endif
-- 
2.17.1


* [PATCH 2/3] [aarch64] memcmp.S: optimize for medium to large sizes
  2018-06-29 12:38 [PATCH 0/3] Improved string comparison routines for aarch64 Siddhesh Poyarekar
@ 2018-06-29 12:38 ` Siddhesh Poyarekar
  2018-06-29 12:38 ` [PATCH 1/3] [aarch64] Improve strncmp for mutually misaligned inputs Siddhesh Poyarekar
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Siddhesh Poyarekar @ 2018-06-29 12:38 UTC (permalink / raw)
  To: newlib

The improved memcmp provides a fast path for compares of up to 16 bytes
and then compares 16 bytes at a time, thus optimizing loads from both
sources.  On the glibc memcmp microbenchmark it retains performance for
smaller compare sizes (within ~1ns) and reduces execution time by up to
31% for compares of up to 4K on the APM Mustang.  On Qualcomm Falkor the
reduction reaches almost 48%, i.e., almost a 2x improvement for sizes of
2K and above.
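
As a rough C sketch of that shape (not the assembly; load8, chunk_result
and memcmp_sketch are names invented for the example, and the small-size
path is simplified to a byte loop):

#include <stdint.h>
#include <string.h>

/* Unaligned-safe 8-byte load.  */
static inline uint64_t load8(const unsigned char *p)
{
    uint64_t v;
    memcpy(&v, p, 8);
    return v;
}

/* Decide the memcmp result from two differing 8-byte chunks by finding
   the first differing byte (the assembly instead byte-reverses on
   little-endian and does a single compare).  */
static int chunk_result(const unsigned char *p1, const unsigned char *p2)
{
    for (int i = 0; i < 8; i++)
        if (p1[i] != p2[i])
            return p1[i] < p2[i] ? -1 : 1;
    return 0;
}

int memcmp_sketch(const void *a, const void *b, size_t n)
{
    const unsigned char *s1 = a, *s2 = b;

    if (n < 16) {                          /* small-size fast path */
        while (n--) {
            if (*s1 != *s2)
                return *s1 < *s2 ? -1 : 1;
            s1++, s2++;
        }
        return 0;
    }

    /* Check the first 16 bytes with unaligned 8-byte loads.  */
    if (load8(s1) != load8(s2))
        return chunk_result(s1, s2);
    if (load8(s1 + 8) != load8(s2 + 8))
        return chunk_result(s1 + 8, s2 + 8);

    /* Main loop, 16 bytes per iteration.  (The real code additionally
       aligns s1 for long inputs so that at most one of the two streams
       is loaded unaligned.)  */
    size_t i = 16;
    while (n - i > 16) {
        if (load8(s1 + i) != load8(s2 + i))
            return chunk_result(s1 + i, s2 + i);
        if (load8(s1 + i + 8) != load8(s2 + i + 8))
            return chunk_result(s1 + i + 8, s2 + i + 8);
        i += 16;
    }

    /* Last 16 bytes via an overlapping load, so there is no byte tail.  */
    i = n - 16;
    if (load8(s1 + i) != load8(s2 + i))
        return chunk_result(s1 + i, s2 + i);
    return chunk_result(s1 + i + 8, s2 + i + 8);
}
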
---
 newlib/libc/machine/aarch64/memcmp.S | 142 +++++++++++++++++++--------
 1 file changed, 99 insertions(+), 43 deletions(-)

diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S
index 1ffb79eb3..605d99365 100644
--- a/newlib/libc/machine/aarch64/memcmp.S
+++ b/newlib/libc/machine/aarch64/memcmp.S
@@ -1,3 +1,31 @@
+/* memcmp - compare memory
+
+   Copyright (c) 2018 Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
 /*
  * Copyright (c) 2017 ARM Ltd
  * All rights reserved.
@@ -35,6 +63,8 @@
  * ARMv8-a, AArch64, unaligned accesses.
  */
 
+#define L(l) .L ## l
+
 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
@@ -44,9 +74,12 @@
 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
-#define data2		x4
-#define data2w		w4
-#define tmp1		x5
+#define data1h		x4
+#define data2		x5
+#define data2w		w5
+#define data2h		x6
+#define tmp1		x7
+#define tmp2		x8
 
         .macro def_fn f p2align=0
         .text
@@ -56,83 +89,106 @@
 \f:
         .endm
 
-/* Small inputs of less than 8 bytes are handled separately.  This allows the
-   main code to be sped up using unaligned loads since there are now at least
-   8 bytes to be compared.  If the first 8 bytes are equal, align src1.
-   This ensures each iteration does at most one unaligned access even if both
-   src1 and src2 are unaligned, and mutually aligned inputs behave as if
-   aligned.  After the main loop, process the last 8 bytes using unaligned
-   accesses.  */
-
 def_fn memcmp p2align=6
 	subs	limit, limit, 8
-	b.lo	.Lless8
+	b.lo	L(less8)
 
-	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
 	ldr	data1, [src1], 8
 	ldr	data2, [src2], 8
-	and	tmp1, src1, 7
-	add	limit, limit, tmp1
 	cmp	data1, data2
-	bne	.Lreturn
+	b.ne	L(return)
+
+	subs	limit, limit, 8
+	b.gt	L(more16)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop16)
 
 	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
 	sub	src1, src1, tmp1
 	sub	src2, src2, tmp1
 
-	subs	limit, limit, 8
-	b.ls	.Llast_bytes
-
-	/* Loop performing 8 bytes per iteration using aligned src1.
-	   Limit is pre-decremented by 8 and must be larger than zero.
-	   Exit if <= 8 bytes left to do or if the data is not equal.  */
+	/* Loop performing 16 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 16 and must be larger than zero.
+	   Exit if <= 16 bytes left to do or if the data is not equal.  */
 	.p2align 4
-.Lloop8:
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	subs	limit, limit, 8
-	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
-	b.eq	.Lloop8
+L(loop16):
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
+	b.eq	L(loop16)
 
 	cmp	data1, data2
-	bne	.Lreturn
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+	bne	L(return)
 
-	/* Compare last 1-8 bytes using unaligned access.  */
-.Llast_bytes:
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
+	/* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp     data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
 
 	/* Compare data bytes and set return value to 0, -1 or 1.  */
-.Lreturn:
+L(return):
 #ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
 	cmp     data1, data2
-.Lret_eq:
+L(ret_eq):
 	cset	result, ne
 	cneg	result, result, lo
-        ret
+	ret
 
 	.p2align 4
 	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
-.Lless8:
+L(less8):
 	adds	limit, limit, 4
-	b.lo	.Lless4
+	b.lo	L(less4)
 	ldr	data1w, [src1], 4
 	ldr	data2w, [src2], 4
 	cmp	data1w, data2w
-	b.ne	.Lreturn
+	b.ne	L(return)
 	sub	limit, limit, 4
-.Lless4:
+L(less4):
 	adds	limit, limit, 4
-	beq	.Lret_eq
-.Lbyte_loop:
+	beq	L(ret_eq)
+L(byte_loop):
 	ldrb	data1w, [src1], 1
 	ldrb	data2w, [src2], 1
 	subs	limit, limit, 1
 	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.eq	.Lbyte_loop
+	b.eq	L(byte_loop)
 	sub	result, data1w, data2w
 	ret
 
-- 
2.17.1


* [PATCH 3/3] [aarch64] strcmp.S: Improve performance for misaligned strings
  2018-06-29 12:38 [PATCH 0/3] Improved string comparison routines for aarch64 Siddhesh Poyarekar
  2018-06-29 12:38 ` [PATCH 2/3] [aarch64] memcmp.S: optimize for medium to large sizes Siddhesh Poyarekar
  2018-06-29 12:38 ` [PATCH 1/3] [aarch64] Improve strncmp for mutually misaligned inputs Siddhesh Poyarekar
@ 2018-06-30  3:19 ` Siddhesh Poyarekar
  2018-07-02 11:36 ` [PATCH 0/3] Improved string comparison routines for aarch64 Corinna Vinschen
  2018-07-13 11:35 ` Corinna Vinschen
  4 siblings, 0 replies; 7+ messages in thread
From: Siddhesh Poyarekar @ 2018-06-30  3:19 UTC (permalink / raw)
  To: newlib

Replace the simple byte-wise compare in the misaligned case with a
dword compare that checks for page boundaries along the way.  For
simplicity I've chosen a 4K page boundary so that we don't have to
query the actual page size on the system.

This results in up to a 3x performance improvement in the unaligned
case on Falkor and about a 2.5x improvement on Mustang, as measured
using bench-strcmp in glibc.
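
For reference, the page-boundary guard placed before each dword load
from SRC2 ("and tmp1, src2, #0xff8; eor tmp1, tmp1, #0xff8; cbz tmp1,
...") corresponds to the following C predicate; near_4k_page_end is a
name made up for this sketch:

#include <stdint.h>

/* True when p lies in the last 8 bytes of a 4 KiB page, i.e. when an
   unaligned 8-byte load starting at p may run into the following page.
   The test is slightly conservative (at offset 0xff8 the load still
   fits), which only costs a handful of byte compares per 4K of input.  */
static inline int near_4k_page_end(const void *p)
{
    return ((uintptr_t)p & 0xff8) == 0xff8;
}

In the dword loop this is used as: if the next load from SRC2 is near
the page end, drop to the byte-at-a-time loop until SRC1 is 8-byte
aligned again, then resume the dword loop on the other side of the
boundary.
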
---
 newlib/libc/machine/aarch64/strcmp.S | 51 ++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/newlib/libc/machine/aarch64/strcmp.S b/newlib/libc/machine/aarch64/strcmp.S
index 85baca920..e2bef2d49 100644
--- a/newlib/libc/machine/aarch64/strcmp.S
+++ b/newlib/libc/machine/aarch64/strcmp.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012-2013, Linaro Limited
+/* Copyright (c) 2012-2018, Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -41,6 +41,8 @@
 \f:
 	.endm
 
+#define L(label) .L ## label
+
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
@@ -69,24 +71,25 @@ def_fn strcmp p2align=6
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
-	b.ne	.Lmisaligned8
+	b.ne	L(misaligned8)
 	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
+	b.ne	L(mutual_align)
 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
 	   can be done in parallel across the entire word.  */
-.Lloop_aligned:
+L(loop_aligned):
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, #REP8_7f
 	eor	diff, data1, data2	/* Non-zero if differences found.  */
 	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	orr	syndrome, diff, has_nul
-	cbz	syndrome, .Lloop_aligned
+	cbz	syndrome, L(loop_aligned)
 	/* End of performance-critical section  -- one 64B cache line.  */
 
+L(end):
 #ifndef	__AARCH64EB__
 	rev	syndrome, syndrome
 	rev	data1, data1
@@ -137,7 +140,7 @@ def_fn strcmp p2align=6
 	ret
 #endif
 
-.Lmutual_align:
+L(mutual_align):
 	/* Sources are mutually aligned, but are not currently at an
 	   alignment boundary.  Round down the addresses and then mask off
 	   the bytes that preceed the start point.  */
@@ -157,15 +160,41 @@ def_fn strcmp p2align=6
 #endif
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
-	b	.Lstart_realigned
+	b	L(start_realigned)
 
-.Lmisaligned8:
-	/* We can do better than this.  */
+L(misaligned8):
+	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+	   checking to make sure that we don't access beyond page boundary in
+	   SRC2.  */
+	tst	src1, #7
+	b.eq	L(loop_misaligned)
+L(do_misaligned):
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
 	cmp	data1w, #1
 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	.Lmisaligned8
+	b.ne	L(done)
+	tst	src1, #7
+	b.ne	L(do_misaligned)
+
+L(loop_misaligned):
+	/* Test if we are within the last dword of the end of a 4K page.  If
+	   yes then jump back to the misaligned loop to copy a byte at a time.  */
+	and	tmp1, src2, #0xff8
+	eor	tmp1, tmp1, #0xff8
+	cbz	tmp1, L(do_misaligned)
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, L(loop_misaligned)
+	b	L(end)
+
+L(done):
 	sub	result, data1, data2
 	ret
 	.size	strcmp, .-strcmp
-- 
2.17.1


* Re: [PATCH 0/3] Improved string comparison routines for aarch64
  2018-06-29 12:38 [PATCH 0/3] Improved string comparison routines for aarch64 Siddhesh Poyarekar
                   ` (2 preceding siblings ...)
  2018-06-30  3:19 ` [PATCH 3/3] [aarch64] strcmp.S: Improve performance for misaligned strings Siddhesh Poyarekar
@ 2018-07-02 11:36 ` Corinna Vinschen
  2018-07-13 11:35 ` Corinna Vinschen
  4 siblings, 0 replies; 7+ messages in thread
From: Corinna Vinschen @ 2018-07-02 11:36 UTC (permalink / raw)
  To: newlib


On Jun 29 18:08, Siddhesh Poyarekar wrote:
> Hi,
> 
> The following patches improve the performance of comparing mutually
> misaligned strings on aarch64 by up to 3.5x, and memcmp by up to 2x.
> 
> Siddhesh
> 
> Siddhesh Poyarekar (3):
>   [aarch64] Improve strncmp for mutually misaligned inputs
>   [aarch64] memcmp.S: optimize for medium to large sizes
>   [aarch64] strcmp.S: Improve performance for misaligned strings
> 
>  newlib/libc/machine/aarch64/memcmp.S  | 142 ++++++++++++++++++--------
>  newlib/libc/machine/aarch64/strcmp.S  |  51 +++++++--
>  newlib/libc/machine/aarch64/strncmp.S |  96 ++++++++++++++---
>  3 files changed, 218 insertions(+), 71 deletions(-)
> 
> -- 
> 2.17.1

A review from the ARM guys here, please?


Thanks,
Corinna

-- 
Corinna Vinschen
Cygwin Maintainer
Red Hat



* Re: [PATCH 0/3] Improved string comparison routines for aarch64
  2018-06-29 12:38 [PATCH 0/3] Improved string comparison routines for aarch64 Siddhesh Poyarekar
                   ` (3 preceding siblings ...)
  2018-07-02 11:36 ` [PATCH 0/3] Improved string comparison routines for aarch64 Corinna Vinschen
@ 2018-07-13 11:35 ` Corinna Vinschen
  2018-07-13 14:50   ` Siddhesh Poyarekar
  4 siblings, 1 reply; 7+ messages in thread
From: Corinna Vinschen @ 2018-07-13 11:35 UTC (permalink / raw)
  To: Siddhesh Poyarekar; +Cc: newlib


On Jun 29 18:08, Siddhesh Poyarekar wrote:
> Hi,
> 
> The following patches improve the performance of comparing mutually
> misaligned strings on aarch64 by up to 3.5x, and memcmp by up to 2x.
> 
> Siddhesh
> 
> Siddhesh Poyarekar (3):
>   [aarch64] Improve strncmp for mutually misaligned inputs
>   [aarch64] memcmp.S: optimize for medium to large sizes
>   [aarch64] strcmp.S: Improve performance for misaligned strings
> 
>  newlib/libc/machine/aarch64/memcmp.S  | 142 ++++++++++++++++++--------
>  newlib/libc/machine/aarch64/strcmp.S  |  51 +++++++--
>  newlib/libc/machine/aarch64/strncmp.S |  96 ++++++++++++++---
>  3 files changed, 218 insertions(+), 71 deletions(-)
> 
> -- 
> 2.17.1

Pushed.


Thanks,
Corinna

-- 
Corinna Vinschen
Cygwin Maintainer
Red Hat



* Re: [PATCH 0/3] Improved string comparison routines for aarch64
  2018-07-13 11:35 ` Corinna Vinschen
@ 2018-07-13 14:50   ` Siddhesh Poyarekar
  0 siblings, 0 replies; 7+ messages in thread
From: Siddhesh Poyarekar @ 2018-07-13 14:50 UTC (permalink / raw)
  To: newlib

On 07/13/2018 04:58 PM, Corinna Vinschen wrote:
> On Jun 29 18:08, Siddhesh Poyarekar wrote:
>> Hi,
>>
>> The following patches improve the performance of comparing mutually
>> misaligned strings on aarch64 by up to 3.5x, and memcmp by up to 2x.
>>
>> Siddhesh
>>
>> Siddhesh Poyarekar (3):
>>    [aarch64] Improve strncmp for mutually misaligned inputs
>>    [aarch64] memcmp.S: optimize for medium to large sizes
>>    [aarch64] strcmp.S: Improve performance for misaligned strings
>>
>>   newlib/libc/machine/aarch64/memcmp.S  | 142 ++++++++++++++++++--------
>>   newlib/libc/machine/aarch64/strcmp.S  |  51 +++++++--
>>   newlib/libc/machine/aarch64/strncmp.S |  96 ++++++++++++++---
>>   3 files changed, 218 insertions(+), 71 deletions(-)
>>
>> -- 
>> 2.17.1
> 
> Pushed.

Thank you!

Siddhesh


