public inbox for libc-alpha@sourceware.org
* [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
From: Siddhesh Poyarekar @ 2018-03-06 13:47 UTC
  To: libc-alpha; +Cc: szabolcs.nagy

On aarch64, mutually misaligned inputs are currently compared one byte
at a time, which is not very efficient.  Enhance the comparison, as in
strcmp, to load a double-word at a time.  The peak performance
improvement (i.e. at 4k maxlen) on the strncmp microbenchmark due to
this change is as follows:

falkor: 3.5x (up to 72% time reduction)
cortex-a73: 3.5x (up to 71% time reduction)
cortex-a53: 3.5x (up to 71% time reduction)

All mutually misaligned inputs from a maxlen of 16 bytes onwards show
upwards of 15% improvement, and there is no measurable effect on the
performance of aligned or mutually aligned inputs.
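
For reference, the dword loop relies on the usual zeroones/REP8_7f
trick to detect a NUL byte in a loaded word.  In rough C terms it looks
like this (a sketch only, not the actual implementation; the name
has_nul_byte is mine):

    #include <stdint.h>

    /* A byte of X is zero iff the corresponding byte of the result is
       nonzero: subtracting REP8_01 borrows out of a zero byte, and
       masking with ~(x | REP8_7f) keeps only bytes whose top bit was
       clear to begin with.  */
    static inline uint64_t
    has_nul_byte (uint64_t x)
    {
      uint64_t zeroones = 0x0101010101010101ULL;  /* REP8_01 */
      uint64_t rep8_7f  = 0x7f7f7f7f7f7f7f7fULL;  /* REP8_7f */
      return (x - zeroones) & ~(x | rep8_7f);
    }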

	* sysdeps/aarch64/strncmp.S (count): New macro.
	(strncmp): Store misaligned length in SRC1 in COUNT.
	(mutual_align): Adjust.
	(misaligned8): Load dword at a time when it is safe.
---
 sysdeps/aarch64/strncmp.S | 95 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 80 insertions(+), 15 deletions(-)

diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
index a08d2c0aa5..20c7ec8dad 100644
--- a/sysdeps/aarch64/strncmp.S
+++ b/sysdeps/aarch64/strncmp.S
@@ -49,6 +49,7 @@
 #define limit_wd	x13
 #define mask		x14
 #define endloop		x15
+#define count		mask
 
 ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
 	DELOUSE (0)
@@ -58,9 +59,9 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
+	and	count, src1, #7
 	b.ne	L(misaligned8)
-	ands	tmp1, src1, #7
-	b.ne	L(mutual_align)
+	cbnz	count, L(mutual_align)
 	/* Calculate the number of full and partial words -1.  */
 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
 	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
@@ -165,43 +166,107 @@ L(mutual_align):
 	bic	src1, src1, #7
 	bic	src2, src2, #7
 	ldr	data1, [src1], #8
-	neg	tmp3, tmp1, lsl #3	/* 64 - bits(bytes beyond align). */
+	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
 	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
 #ifdef __AARCH64EB__
 	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
+	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
 #else
 	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp3	/* Shift (tmp1 & 63).  */
+	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
 #endif
 	and	tmp3, limit_wd, #7
 	lsr	limit_wd, limit_wd, #3
 	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, tmp1
-	add	tmp3, tmp3, tmp1
+	add	limit, limit, count
+	add	tmp3, tmp3, count
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
 	add	limit_wd, limit_wd, tmp3, lsr #3
 	b	L(start_realigned)
 
-L(ret0):
-	mov	result, #0
-	RET
-
 	.p2align 6
+	/* Don't bother with dwords for fewer than 16 bytes.  */
 L(misaligned8):
-	sub	limit, limit, #1
-1:
+	cmp	limit, #16
+	b.hs	L(try_misaligned_words)
+
+L(byte_loop):
 	/* Perhaps we can do better than this.  */
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
 	subs	limit, limit, #1
-	ccmp	data1w, #1, #0, cs	/* NZCV = 0b0000.  */
+	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
+	b.eq	L(byte_loop)
+L(done):
 	sub	result, data1, data2
 	RET
+
+	/* Align SRC1 to a dword boundary with a bytewise compare, then
+	   do the dword comparison loop.  */
+L(try_misaligned_words):
+	mov	limit_wd, limit, lsr #3
+	cbz	count, L(do_misaligned)
+
+	neg	count, count
+	and	count, count, #7
+	sub	limit, limit, count
+	mov	limit_wd, limit, lsr #3
+
+L(page_end_loop):
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	L(done)
+	subs	count, count, #1
+	b.hi	L(page_end_loop)
+
+L(do_misaligned):
+	/* Prepare ourselves for the next page crossing.  Unlike the aligned
+	   loop, we fetch 1 less dword because we risk crossing bounds on
+	   SRC2.  */
+	mov	count, #8
+	subs	limit_wd, limit_wd, #1
+	b.lo	L(done_loop)
+L(loop_misaligned):
+	and	tmp2, src2, #0xff8
+	eor	tmp2, tmp2, #0xff8
+	cbz	tmp2, L(page_end_loop)
+
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	L(not_limit)
+	subs	limit_wd, limit_wd, #1
+	b.pl	L(loop_misaligned)
+
+L(done_loop):
+	/* The dword limit was reached without a difference or NUL; check the tail.  */
+	and	limit, limit, #7
+	cbz	limit, L(not_limit)
+	/* Read the last word.  */
+	sub	src1, src1, 8
+	sub	src2, src2, 8
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	L(not_limit)
+
+L(ret0):
+	mov	result, #0
+	RET
+
 END (strncmp)
 libc_hidden_builtin_def (strncmp)
-- 
2.14.3
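
A note on the page-cross guard in L(loop_misaligned): the pair

	and	tmp2, src2, #0xff8
	eor	tmp2, tmp2, #0xff8

tests whether SRC2 points into the last 8-byte granule of a 4K page,
i.e. whether a full dword load might cross the page boundary.  A rough
C model (a sketch assuming 4K pages; near_page_end is a made-up name):

    #include <stdint.h>

    /* Nonzero iff a dword load from SRC2 could cross into the next
       4K page, i.e. the page offset is in [0xff8, 0xfff].  */
    static inline int
    near_page_end (uintptr_t src2)
    {
      return (src2 & 0xff8) == 0xff8;
    }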

* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
From: Wilco Dijkstra @ 2018-03-14 14:04 UTC
  To: siddhesh; +Cc: Szabolcs Nagy, libc-alpha, nd

Hi,

Why not use lsr limit_wd, limit, 3? We have 3-operand shifts on AArch64!

--- a/sysdeps/aarch64/strncmp.S
+++ b/sysdeps/aarch64/strncmp.S
@@ -208,13 +208,15 @@ L(done):
        /* Align the SRC1 to a dword by doing a bytewise compare and then do
           the dword loop.  */
 L(try_misaligned_words):
-       mov     limit_wd, limit, lsr #3
+       mov     limit_wd, limit
+       lsr     limit_wd, limit_wd, #3
        cbz     count, L(do_misaligned)
 
        neg     count, count
        and     count, count, #7
        sub     limit, limit, count
-       mov     limit_wd, limit, lsr #3
+       mov     limit_wd, limit
+       lsr     limit_wd, limit_wd, #3

Also, it seems to me it would be far easier to subtract 8 from limit in the
main loop.  That way limit_wd is never needed, and we avoid having to do
this later:

        /* We found a difference or a NULL before the limit was reached.  */
        and     limit, limit, #7
        cbz     limit, L(not_limit)
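
In rough C terms the suggested structure would be something like this
(a sketch with hypothetical names, page-cross handling omitted, not
actual patch code):

    #include <stdint.h>
    #include <string.h>

    /* Count LIMIT down in bytes inside the dword loop, so no separate
       dword counter (limit_wd) and no "and limit, limit, #7" fixup
       are needed afterwards.  */
    static int
    misaligned_loop (const unsigned char *src1, const unsigned char *src2,
                     uint64_t limit)
    {
      while (limit >= 8)
        {
          uint64_t data1, data2;
          memcpy (&data1, src1, 8);        /* Stands in for the ldr.  */
          memcpy (&data2, src2, 8);
          uint64_t has_nul = (data1 - 0x0101010101010101ULL)
                             & ~(data1 | 0x7f7f7f7f7f7f7f7fULL);
          if (data1 != data2 || has_nul != 0)
            break;                         /* Resolve bytewise below.  */
          src1 += 8; src2 += 8; limit -= 8;
        }
      /* Tail bytes, or the dword containing the difference/NUL.  */
      while (limit > 0)
        {
          if (*src1 != *src2 || *src1 == '\0')
            return *src1 - *src2;
          src1++; src2++; limit--;
        }
      return 0;
    }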

Wilco

