* [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
@ 2018-03-06 13:47 Siddhesh Poyarekar
2018-03-13 9:03 ` Siddhesh Poyarekar
0 siblings, 1 reply; 11+ messages in thread
From: Siddhesh Poyarekar @ 2018-03-06 13:47 UTC (permalink / raw)
To: libc-alpha; +Cc: szabolcs.nagy
The mutually misaligned inputs on aarch64 are compared with a simple
byte copy, which is not very efficient. Enhance the comparison
similar to strcmp by loading a double-word at a time. The peak
performance improvement (i.e. 4k maxlen comparisons) due to this on
the strncmp microbenchmark is as follows:
falkor: 3.5x (up to 72% time reduction)
cortex-a73: 3.5x (up to 71% time reduction)
cortex-a53: 3.5x (up to 71% time reduction)
All mutually misaligned inputs from 16 bytes maxlen onwards show
upwards of 15% improvement and there is no measurable effect on the
performance of aligned/mutually aligned inputs.
* sysdeps/aarch64/strncmp.S (count): New macro.
(strncmp): Store misaligned length in SRC1 in COUNT.
(mutual_align): Adjust.
(misaligned8): Load dword at a time when it is safe.
---
sysdeps/aarch64/strncmp.S | 95 +++++++++++++++++++++++++++++++++++++++--------
1 file changed, 80 insertions(+), 15 deletions(-)
diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
index a08d2c0aa5..20c7ec8dad 100644
--- a/sysdeps/aarch64/strncmp.S
+++ b/sysdeps/aarch64/strncmp.S
@@ -49,6 +49,7 @@
#define limit_wd x13
#define mask x14
#define endloop x15
+#define count mask
ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
DELOUSE (0)
@@ -58,9 +59,9 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
+ and count, src1, #7
b.ne L(misaligned8)
- ands tmp1, src1, #7
- b.ne L(mutual_align)
+ cbnz count, L(mutual_align)
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
@@ -165,43 +166,107 @@ L(mutual_align):
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
- neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
#endif
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, tmp1
- add tmp3, tmp3, tmp1
+ add limit, limit, count
+ add tmp3, tmp3, count
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
-L(ret0):
- mov result, #0
- RET
-
.p2align 6
+ /* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
- sub limit, limit, #1
-1:
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
+
+L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
- ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq 1b
+ b.eq L(byte_loop)
+L(done):
sub result, data1, data2
RET
+
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+L(try_misaligned_words):
+ mov limit_wd, limit, lsr #3
+ cbz count, L(do_misaligned)
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+ mov limit_wd, limit, lsr #3
+
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
+
+L(do_misaligned):
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+ loop, we fetch 1 less dword because we risk crossing bounds on
+ SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo L(done_loop)
+L(loop_misaligned):
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, L(page_end_loop)
+
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
+ subs limit_wd, limit_wd, #1
+ b.pl L(loop_misaligned)
+
+L(done_loop):
+ /* We found a difference or a NULL before the limit was reached. */
+ and limit, limit, #7
+ cbz limit, L(not_limit)
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
+
+L(ret0):
+ mov result, #0
+ RET
+
END (strncmp)
libc_hidden_builtin_def (strncmp)
--
2.14.3
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
2018-03-06 13:47 [PATCH] aarch64: Improve strncmp for mutually misaligned inputs Siddhesh Poyarekar
@ 2018-03-13 9:03 ` Siddhesh Poyarekar
2018-03-13 13:12 ` Szabolcs Nagy
0 siblings, 1 reply; 11+ messages in thread
From: Siddhesh Poyarekar @ 2018-03-13 9:03 UTC (permalink / raw)
To: libc-alpha; +Cc: szabolcs.nagy
Ping!
On Tuesday 06 March 2018 07:17 PM, Siddhesh Poyarekar wrote:
> The mutually misaligned inputs on aarch64 are compared with a simple
> byte copy, which is not very efficient. Enhance the comparison
> similar to strcmp by loading a double-word at a time. The peak
> performance improvement (i.e. 4k maxlen comparisons) due to this on
> the strncmp microbenchmark is as follows:
>
> falkor: 3.5x (up to 72% time reduction)
> cortex-a73: 3.5x (up to 71% time reduction)
> cortex-a53: 3.5x (up to 71% time reduction)
>
> All mutually misaligned inputs from 16 bytes maxlen onwards show
> upwards of 15% improvement and there is no measurable effect on the
> performance of aligned/mutually aligned inputs.
>
> * sysdeps/aarch64/strncmp.S (count): New macro.
> (strncmp): Store misaligned length in SRC1 in COUNT.
> (mutual_align): Adjust.
> (misaligned8): Load dword at a time when it is safe.
> ---
> sysdeps/aarch64/strncmp.S | 95 +++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 80 insertions(+), 15 deletions(-)
>
> diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
> index a08d2c0aa5..20c7ec8dad 100644
> --- a/sysdeps/aarch64/strncmp.S
> +++ b/sysdeps/aarch64/strncmp.S
> @@ -49,6 +49,7 @@
> #define limit_wd x13
> #define mask x14
> #define endloop x15
> +#define count mask
>
> ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
> DELOUSE (0)
> @@ -58,9 +59,9 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
> eor tmp1, src1, src2
> mov zeroones, #REP8_01
> tst tmp1, #7
> + and count, src1, #7
> b.ne L(misaligned8)
> - ands tmp1, src1, #7
> - b.ne L(mutual_align)
> + cbnz count, L(mutual_align)
> /* Calculate the number of full and partial words -1. */
> sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
> lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
> @@ -165,43 +166,107 @@ L(mutual_align):
> bic src1, src1, #7
> bic src2, src2, #7
> ldr data1, [src1], #8
> - neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
> + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
> ldr data2, [src2], #8
> mov tmp2, #~0
> sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
> #ifdef __AARCH64EB__
> /* Big-endian. Early bytes are at MSB. */
> - lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
> + lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
> #else
> /* Little-endian. Early bytes are at LSB. */
> - lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
> + lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
> #endif
> and tmp3, limit_wd, #7
> lsr limit_wd, limit_wd, #3
> /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
> - add limit, limit, tmp1
> - add tmp3, tmp3, tmp1
> + add limit, limit, count
> + add tmp3, tmp3, count
> orr data1, data1, tmp2
> orr data2, data2, tmp2
> add limit_wd, limit_wd, tmp3, lsr #3
> b L(start_realigned)
>
> -L(ret0):
> - mov result, #0
> - RET
> -
> .p2align 6
> + /* Don't bother with dwords for up to 16 bytes. */
> L(misaligned8):
> - sub limit, limit, #1
> -1:
> + cmp limit, #16
> + b.hs L(try_misaligned_words)
> +
> +L(byte_loop):
> /* Perhaps we can do better than this. */
> ldrb data1w, [src1], #1
> ldrb data2w, [src2], #1
> subs limit, limit, #1
> - ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */
> + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
> ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
> - b.eq 1b
> + b.eq L(byte_loop)
> +L(done):
> sub result, data1, data2
> RET
> +
> + /* Align the SRC1 to a dword by doing a bytewise compare and then do
> + the dword loop. */
> +L(try_misaligned_words):
> + mov limit_wd, limit, lsr #3
> + cbz count, L(do_misaligned)
> +
> + neg count, count
> + and count, count, #7
> + sub limit, limit, count
> + mov limit_wd, limit, lsr #3
> +
> +L(page_end_loop):
> + ldrb data1w, [src1], #1
> + ldrb data2w, [src2], #1
> + cmp data1w, #1
> + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
> + b.ne L(done)
> + subs count, count, #1
> + b.hi L(page_end_loop)
> +
> +L(do_misaligned):
> + /* Prepare ourselves for the next page crossing. Unlike the aligned
> + loop, we fetch 1 less dword because we risk crossing bounds on
> + SRC2. */
> + mov count, #8
> + subs limit_wd, limit_wd, #1
> + b.lo L(done_loop)
> +L(loop_misaligned):
> + and tmp2, src2, #0xff8
> + eor tmp2, tmp2, #0xff8
> + cbz tmp2, L(page_end_loop)
> +
> + ldr data1, [src1], #8
> + ldr data2, [src2], #8
> + sub tmp1, data1, zeroones
> + orr tmp2, data1, #REP8_7f
> + eor diff, data1, data2 /* Non-zero if differences found. */
> + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> + ccmp diff, #0, #0, eq
> + b.ne L(not_limit)
> + subs limit_wd, limit_wd, #1
> + b.pl L(loop_misaligned)
> +
> +L(done_loop):
> + /* We found a difference or a NULL before the limit was reached. */
> + and limit, limit, #7
> + cbz limit, L(not_limit)
> + /* Read the last word. */
> + sub src1, src1, 8
> + sub src2, src2, 8
> + ldr data1, [src1, limit]
> + ldr data2, [src2, limit]
> + sub tmp1, data1, zeroones
> + orr tmp2, data1, #REP8_7f
> + eor diff, data1, data2 /* Non-zero if differences found. */
> + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
> + ccmp diff, #0, #0, eq
> + b.ne L(not_limit)
> +
> +L(ret0):
> + mov result, #0
> + RET
> +
> END (strncmp)
> libc_hidden_builtin_def (strncmp)
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
2018-03-13 9:03 ` Siddhesh Poyarekar
@ 2018-03-13 13:12 ` Szabolcs Nagy
2018-03-13 13:18 ` Siddhesh Poyarekar
2018-03-14 12:36 ` Szabolcs Nagy
0 siblings, 2 replies; 11+ messages in thread
From: Szabolcs Nagy @ 2018-03-13 13:12 UTC (permalink / raw)
To: Siddhesh Poyarekar, libc-alpha; +Cc: nd
On 13/03/18 09:03, Siddhesh Poyarekar wrote:
> Ping!
>
> On Tuesday 06 March 2018 07:17 PM, Siddhesh Poyarekar wrote:
>> The mutually misaligned inputs on aarch64 are compared with a simple
>> byte copy, which is not very efficient. Enhance the comparison
>> similar to strcmp by loading a double-word at a time. The peak
>> performance improvement (i.e. 4k maxlen comparisons) due to this on
>> the strncmp microbenchmark is as follows:
>>
>> falkor: 3.5x (up to 72% time reduction)
>> cortex-a73: 3.5x (up to 71% time reduction)
>> cortex-a53: 3.5x (up to 71% time reduction)
>>
>> All mutually misaligned inputs from 16 bytes maxlen onwards show
>> upwards of 15% improvement and there is no measurable effect on the
>> performance of aligned/mutually aligned inputs.
>>
>> * sysdeps/aarch64/strncmp.S (count): New macro.
>> (strncmp): Store misaligned length in SRC1 in COUNT.
>> (mutual_align): Adjust.
>> (misaligned8): Load dword at a time when it is safe.
OK to commit.
(it would be nice to have the equivalent change in newlib too..)
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
2018-03-13 13:12 ` Szabolcs Nagy
@ 2018-03-13 13:18 ` Siddhesh Poyarekar
2018-03-14 12:36 ` Szabolcs Nagy
1 sibling, 0 replies; 11+ messages in thread
From: Siddhesh Poyarekar @ 2018-03-13 13:18 UTC (permalink / raw)
To: Szabolcs Nagy, libc-alpha; +Cc: nd
On Tuesday 13 March 2018 06:42 PM, Szabolcs Nagy wrote:
> OK to commit.
>
> (it would be nice to have the equivalent change in newlib too..)
Yes, it's in my TODO list for later this week.
Siddhesh
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
2018-03-13 13:12 ` Szabolcs Nagy
2018-03-13 13:18 ` Siddhesh Poyarekar
@ 2018-03-14 12:36 ` Szabolcs Nagy
2018-03-14 13:25 ` Siddhesh Poyarekar
1 sibling, 1 reply; 11+ messages in thread
From: Szabolcs Nagy @ 2018-03-14 12:36 UTC (permalink / raw)
To: Siddhesh Poyarekar, libc-alpha; +Cc: nd
On 13/03/18 13:12, Szabolcs Nagy wrote:
> On 13/03/18 09:03, Siddhesh Poyarekar wrote:
>> Ping!
>>
>> On Tuesday 06 March 2018 07:17 PM, Siddhesh Poyarekar wrote:
>>> The mutually misaligned inputs on aarch64 are compared with a simple
>>> byte copy, which is not very efficient. Enhance the comparison
>>> similar to strcmp by loading a double-word at a time. The peak
>>> performance improvement (i.e. 4k maxlen comparisons) due to this on
>>> the strncmp microbenchmark is as follows:
>>>
>>> falkor: 3.5x (up to 72% time reduction)
>>> cortex-a73: 3.5x (up to 71% time reduction)
>>> cortex-a53: 3.5x (up to 71% time reduction)
>>>
>>> All mutually misaligned inputs from 16 bytes maxlen onwards show
>>> upwards of 15% improvement and there is no measurable effect on the
>>> performance of aligned/mutually aligned inputs.
>>>
>>> 	* sysdeps/aarch64/strncmp.S (count): New macro.
>>> 	(strncmp): Store misaligned length in SRC1 in COUNT.
>>> 	(mutual_align): Adjust.
>>> 	(misaligned8): Load dword at a time when it is safe.
>
> OK to commit.
>
> (it would be nice to have the equivalent change in newlib too..)
this broke the build for me
../sysdeps/aarch64/strncmp.S: Assembler messages:
../sysdeps/aarch64/strncmp.S:211: Error: unexpected characters following instruction at operand 2 -- `mov x13,x2,lsr#3'
../sysdeps/aarch64/strncmp.S:217: Error: unexpected characters following instruction at operand 2 -- `mov x13,x2,lsr#3'
old binutils 2.26 and before did not support mov with shifted
register (only orr reg,xzr,reg,shift).
but i think a shift instruction (lsr) should be better anyway
(on most implementations).
can you please fix this?
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
2018-03-14 12:36 ` Szabolcs Nagy
@ 2018-03-14 13:25 ` Siddhesh Poyarekar
0 siblings, 0 replies; 11+ messages in thread
From: Siddhesh Poyarekar @ 2018-03-14 13:25 UTC (permalink / raw)
To: Szabolcs Nagy, libc-alpha; +Cc: nd
On Wednesday 14 March 2018 06:05 PM, Szabolcs Nagy wrote:
> this broke the build for me
>
> ../sysdeps/aarch64/strncmp.S: Assembler messages:
> ../sysdeps/aarch64/strncmp.S:211: Error: unexpected characters following
> instruction at operand 2 -- `mov x13,x2,lsr#3'
> ../sysdeps/aarch64/strncmp.S:217: Error: unexpected characters following
> instruction at operand 2 -- `mov x13,x2,lsr#3'
>
> old binutils 2.26 and before did not support mov with shifted
> register (only orr reg,xzr,reg,shift).
>
> but i think a shift instruction (lsr) should be better anyway
> (on most implementations).
>
> can you please fix this?
Thanks for pointing out, I've pushed this patch[1] after testing it on a
xenial box, which showed this error. This was my old builder and I
recently moved to a new one based on bionic because of which I didn't
see this problem earlier.
Siddhesh
[1]
https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d46f84de745db8f3f06a37048261f4e5ceacf0a3
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
@ 2018-03-14 14:04 Wilco Dijkstra
2018-03-14 14:20 ` Siddhesh Poyarekar
0 siblings, 1 reply; 11+ messages in thread
From: Wilco Dijkstra @ 2018-03-14 14:04 UTC (permalink / raw)
To: siddhesh; +Cc: Szabolcs Nagy, libc-alpha, nd
Hi,
Why not use lsr limit_wd, limit, 3? We have 3-operand shifts on AArch64!
--- a/sysdeps/aarch64/strncmp.S
+++ b/sysdeps/aarch64/strncmp.S
@@ -208,13 +208,15 @@ L(done):
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
- mov limit_wd, limit, lsr #3
+ mov limit_wd, limit
+ lsr limit_wd, limit_wd, #3
cbz count, L(do_misaligned)
neg count, count
and count, count, #7
sub limit, limit, count
- mov limit_wd, limit, lsr #3
+ mov limit_wd, limit
+ lsr limit_wd, limit_wd, #3
Also it seems to me it would be far easier to subtract 8 from limit in the main loop.
This means we don't ever need limit_wd, and avoids having to do this later:
/* We found a difference or a NULL before the limit was reached. */
and limit, limit, #7
cbz limit, L(not_limit)
Wilco
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
2018-03-14 14:04 Wilco Dijkstra
@ 2018-03-14 14:20 ` Siddhesh Poyarekar
2018-03-15 2:37 ` Siddhesh Poyarekar
0 siblings, 1 reply; 11+ messages in thread
From: Siddhesh Poyarekar @ 2018-03-14 14:20 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: Szabolcs Nagy, libc-alpha, nd
On Wednesday 14 March 2018 07:34 PM, Wilco Dijkstra wrote:
> Why not use lsr limit_wd, limit, 3? We have 3-operand shifts on AArch64!
Because I was half asleep and just followed what Szabolcs said ;)
I'll fix that up later (I can barely sit today, my back is killing me)
or please feel free to fix up if you'd like to.
Thanks,
Siddhesh
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
2018-03-14 14:20 ` Siddhesh Poyarekar
@ 2018-03-15 2:37 ` Siddhesh Poyarekar
2018-03-15 13:44 ` Wilco Dijkstra
0 siblings, 1 reply; 11+ messages in thread
From: Siddhesh Poyarekar @ 2018-03-15 2:37 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: Szabolcs Nagy, libc-alpha, nd
On Wednesday 14 March 2018 07:50 PM, Siddhesh Poyarekar wrote:
> On Wednesday 14 March 2018 07:34 PM, Wilco Dijkstra wrote:
>> Why not use lsr limit_wd, limit, 3? We have 3-operand shifts on AArch64!
>
> Because I was half asleep and just followed what Szabolcs said ;)
>
> I'll fix that up later (I can barely sit today, my back is killing me)
> or please feel free to fix up if you'd like to.
I have fixed this now:
https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b47c3e7637efb77818cbef55dcd0ed1f0ea0ddf1
Thanks,
Siddhesh
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
2018-03-15 2:37 ` Siddhesh Poyarekar
@ 2018-03-15 13:44 ` Wilco Dijkstra
2018-03-15 13:57 ` Siddhesh Poyarekar
0 siblings, 1 reply; 11+ messages in thread
From: Wilco Dijkstra @ 2018-03-15 13:44 UTC (permalink / raw)
To: Siddhesh Poyarekar; +Cc: Szabolcs Nagy, libc-alpha, nd
Siddhesh Poyarekar wrote:
> I have fixed this now:
>
> https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b47c3e7637efb77818cbef55dcd0ed1f0ea0ddf1
Thanks, that's fine for now. We should look into tuning this further in the future,
I think both strcmp and strncmp should be able to be almost as fast as memcmp.
Wilco
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] aarch64: Improve strncmp for mutually misaligned inputs
2018-03-15 13:44 ` Wilco Dijkstra
@ 2018-03-15 13:57 ` Siddhesh Poyarekar
0 siblings, 0 replies; 11+ messages in thread
From: Siddhesh Poyarekar @ 2018-03-15 13:57 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: Szabolcs Nagy, libc-alpha, nd
On Thursday 15 March 2018 07:14 PM, Wilco Dijkstra wrote:
> Thanks, that's fine for now. We should look into tuning this further in the future,
> I think both strcmp and strncmp should be able to be almost as fast as memcmp.
Agreed, I haven't taken it off my plate. This was a pretty big gain to
keep holding on to though, which is why I pushed it out early.
Siddhesh
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2018-03-15 13:57 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-06 13:47 [PATCH] aarch64: Improve strncmp for mutually misaligned inputs Siddhesh Poyarekar
2018-03-13 9:03 ` Siddhesh Poyarekar
2018-03-13 13:12 ` Szabolcs Nagy
2018-03-13 13:18 ` Siddhesh Poyarekar
2018-03-14 12:36 ` Szabolcs Nagy
2018-03-14 13:25 ` Siddhesh Poyarekar
2018-03-14 14:04 Wilco Dijkstra
2018-03-14 14:20 ` Siddhesh Poyarekar
2018-03-15 2:37 ` Siddhesh Poyarekar
2018-03-15 13:44 ` Wilco Dijkstra
2018-03-15 13:57 ` Siddhesh Poyarekar
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).