* [ARM] Optimised strchr and strlen
@ 2011-12-19 17:21 Dr. David Alan Gilbert
[not found] ` <CAODfWeEuNWtsWebJd_GRDnYzTkYW9GWHoSt5qcYRAADTaCsVgQ@mail.gmail.com>
` (2 more replies)
0 siblings, 3 replies; 10+ messages in thread
From: Dr. David Alan Gilbert @ 2011-12-19 17:21 UTC (permalink / raw)
To: libc-ports; +Cc: joseph, patches
This is a strchr and strlen optimised for ARM v6t2 or v7. It's against svn rev r15869
with my previous memchr patch. Tested both little & big endian.
(I've checked it still applies on svn trunk, but not done a retest on that; nothing
seems to have changed around there).
Dave
2012-12-19 Dr. David Alan Gilbert <david.gilbert@linaro.org>
* sysdeps/arm/eabi/armv6t2/strchr.S: New file
* sysdeps/arm/eabi/armv6t2/strlen.S: New file
diff -urN ports/sysdeps/arm/eabi/armv6t2/strchr.S src/ports/sysdeps/arm/eabi/armv6t2/strchr.S
--- ports/sysdeps/arm/eabi/armv6t2/strchr.S 1970-01-01 01:00:00.000000000 +0100
+++ ports/sysdeps/arm/eabi/armv6t2/strchr.S 2011-12-16 13:43:56.704694919 +0000
@@ -0,0 +1,71 @@
+/* Copyright (C) 2011 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Code contributed by Dave Gilbert <david.gilbert@linaro.org>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+@ A very simple strchr routine, from benchmarks on A9 it's a bit faster than
+@ the current version in eglibc.
+@ While I have a version that does 8 bytes/loop and is a lot faster on long
+@ strings, it is slower on short strings, and short strings seem more common
+@ in strchr usage.
+@ Note: The use of cbz/cbnz means it's Thumb only
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+@ 2011-12-16 david.gilbert@linaro.org
+@ Copy from Cortex strings rev 65 and change license
+
+ .syntax unified
+
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+
+ .thumb_func
+ .global strchr
+ .type strchr,%function
+ENTRY(strchr)
+ @ r0 = start of string
+ @ r1 = character to match
+ @ returns NULL for no match, or a pointer to the match
+ and r1,r1, #255
+
+1:
+ ldrb r2,[r0],#1
+ cmp r2,r1
+ cbz r2,10f
+ bne 1b
+
+ @ We're here if it matched
+5:
+ subs r0,r0,#1
+ DO_RET(lr)
+
+10:
+ @ We're here if we ran off the end
+ cmp r1, #0 @ Corner case - you can search for the nil and get a pointer to it
+ beq 5b @ messy, if common we should branch at the start to a special loop
+ mov r0,#0
+ DO_RET(lr)
+
+END(strchr)
+
+weak_alias (strchr, index)
+libc_hidden_builtin_def(strchr)
diff -urN ports/sysdeps/arm/eabi/armv6t2/strlen.S src/ports/sysdeps/arm/eabi/armv6t2/strlen.S
--- ports/sysdeps/arm/eabi/armv6t2/strlen.S 1970-01-01 01:00:00.000000000 +0100
+++ ports/sysdeps/arm/eabi/armv6t2/strlen.S 2011-12-16 13:43:01.991130183 +0000
@@ -0,0 +1,118 @@
+/* Copyright (C) 2011 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Code contributed by Dave Gilbert <david.gilbert@linaro.org>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+@ This strlen routine is optimised on a Cortex-A9 and should work on
+@ all ARMv7 processors. This routine is reasonably fast for short
+@ strings, but is probably slower than a simple implementation if all
+@ your strings are very short
+@ Note: The use of cbz/cbnz means it's Thumb only
+
+@ 2011-02-08 david.gilbert@linaro.org
+@ Extracted from local git 6848613a
+@ 2011-12-16 david.gilbert@linaro.org
+@ Copy from Cortex strings rev 65 and change license
+@ Add cfi magic, switch to ldrd
+
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+
+@-----------------------------------------------------------------------------
+ .syntax unified
+
+ .text
+ .thumb
+
+ .thumb_func
+ .global strlen
+ .type strlen,%function
+ENTRY(strlen)
+ @ r0 = string
+ @ returns count of bytes in string not including terminator
+ mov r1, r0
+ push { r4,r6 }
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (r4, 0)
+ cfi_rel_offset (r6, 4)
+
+ cfi_remember_state
+
+ mvns r6, #0 @ all F
+ movs r4, #0
+ tst r0, #7
+ beq 2f
+
+1:
+ ldrb r2, [r1], #1
+ tst r1, #7 @ Hit alignment yet?
+ cbz r2, 10f @ Exit if we found the 0
+ bne 1b
+
+ @ So we're now aligned
+2:
+ ldrd r2,r3,[r1],#8
+ uadd8 r2, r2, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r2, r4, r6 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r3, r3, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r3, r2, r6 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cmp r3, #0
+ beq 2b
+
+strlenendtmp:
+ @ One (or more) of the bytes we loaded was 0 - but which one?
+ @ r2 has the mask corresponding to the first loaded word
+ @ r3 has a combined mask of the two words - but if r2 was all-non 0
+ @ then it's just the 2nd words
+ cmp r2, #0
+ itte eq
+ moveq r2, r3 @ the end is in the 2nd word
+ subeq r1,r1,#3
+ subne r1,r1,#7
+
+ @ r1 currently points to the 2nd byte of the word containing the 0
+ tst r2, # CHARTSTMASK(0) @ 1st character
+ bne 10f
+ adds r1,r1,#1
+ tst r2, # CHARTSTMASK(1) @ 2nd character
+ ittt eq
+ addeq r1,r1,#1
+ tsteq r2, # (3<<15) @ 2nd & 3rd character
+ @ If not the 3rd must be the last one
+ addeq r1,r1,#1
+
+10:
+ @ r0 is still at the beginning, r1 is pointing 1 byte after terminator
+ sub r0, r1, r0
+ subs r0, r0, #1
+ pop { r4, r6 }
+
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (r4)
+ cfi_restore (r6)
+
+ DO_RET(lr)
+
+END(strlen)
+libc_hidden_builtin_def (strlen)
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [ARM] Optimised strchr and strlen
[not found] ` <CAODfWeEuNWtsWebJd_GRDnYzTkYW9GWHoSt5qcYRAADTaCsVgQ@mail.gmail.com>
@ 2011-12-19 17:39 ` David Gilbert
0 siblings, 0 replies; 10+ messages in thread
From: David Gilbert @ 2011-12-19 17:39 UTC (permalink / raw)
To: Hector Oron; +Cc: libc-ports
On 19 December 2011 17:30, Hector Oron <hector.oron@gmail.com> wrote:
> Hello,
>
> 2011/12/19 Dr. David Alan Gilbert <david.gilbert@linaro.org>:
>
>> 2012-12-19 Dr. David Alan Gilbert <david.gilbert@linaro.org>
>> * sysdeps/arm/eabi/armv6t2/strchr.S: New file
>> * sysdeps/arm/eabi/armv6t2/strlen.S: New file
>
> As a minor issue, I think the date on your ChangeLog entry is bogus by
> a year. :-)
Oops yes - well spotted!
Dave
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [ARM] Optimised strchr and strlen
2011-12-19 17:21 [ARM] Optimised strchr and strlen Dr. David Alan Gilbert
[not found] ` <CAODfWeEuNWtsWebJd_GRDnYzTkYW9GWHoSt5qcYRAADTaCsVgQ@mail.gmail.com>
@ 2011-12-20 20:29 ` Richard Henderson
2011-12-21 10:56 ` David Gilbert
2011-12-20 23:07 ` Joseph S. Myers
2 siblings, 1 reply; 10+ messages in thread
From: Richard Henderson @ 2011-12-20 20:29 UTC (permalink / raw)
To: Dr. David Alan Gilbert; +Cc: libc-ports, joseph, patches
On 12/19/2011 09:21 AM, Dr. David Alan Gilbert wrote:
> + @ r1 currently points to the 2nd byte of the word containing the 0
> + tst r2, # CHARTSTMASK(0) @ 1st character
> + bne 10f
> + adds r1,r1,#1
> + tst r2, # CHARTSTMASK(1) @ 2nd character
> + ittt eq
> + addeq r1,r1,#1
> + tsteq r2, # (3<<15) @ 2nd & 3rd character
> + @ If not the 3rd must be the last one
> + addeq r1,r1,#1
No need to search -- clz can do that for you.
#ifdef __ARMEL__
rbit r2, r2
#endif
clz r2, r2
lsrs r2, r2, #3
adds r1, r1, r2
r~
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [ARM] Optimised strchr and strlen
2011-12-19 17:21 [ARM] Optimised strchr and strlen Dr. David Alan Gilbert
[not found] ` <CAODfWeEuNWtsWebJd_GRDnYzTkYW9GWHoSt5qcYRAADTaCsVgQ@mail.gmail.com>
2011-12-20 20:29 ` Richard Henderson
@ 2011-12-20 23:07 ` Joseph S. Myers
2011-12-21 10:55 ` David Gilbert
2 siblings, 1 reply; 10+ messages in thread
From: Joseph S. Myers @ 2011-12-20 23:07 UTC (permalink / raw)
To: Dr. David Alan Gilbert; +Cc: libc-ports, patches
On Mon, 19 Dec 2011, Dr. David Alan Gilbert wrote:
> +@ A very simple strchr routine, from benchmarks on A9 it's a bit faster than
> +@ the current version in eglibc.
> +@ While I have a version that does 8 bytes/loop and is a lot faster on long
> +@ strings, it is slower on short strings, and short strings seem more common
> +@ in strchr usage.
That sounds like a possible case for a hybrid function, with an unrolled
initial part testing some number of characters to cover short strings (it
might be possible to get things aligned at the same time) and the more
complicated version for longer strings. What's the actual size
distribution you see in strchr use?
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [ARM] Optimised strchr and strlen
2011-12-20 23:07 ` Joseph S. Myers
@ 2011-12-21 10:55 ` David Gilbert
2011-12-21 21:21 ` Richard Henderson
0 siblings, 1 reply; 10+ messages in thread
From: David Gilbert @ 2011-12-21 10:55 UTC (permalink / raw)
To: Joseph S. Myers; +Cc: libc-ports, patches
On 20 December 2011 23:07, Joseph S. Myers <joseph@codesourcery.com> wrote:
> On Mon, 19 Dec 2011, Dr. David Alan Gilbert wrote:
>
>> +@ A very simple strchr routine, from benchmarks on A9 it's a bit faster than
>> +@ the current version in eglibc.
>> +@ While I have a version that does 8 bytes/loop and is a lot faster on long
>> +@ strings, it is slower on short strings, and short strings seem more common
>> +@ in strchr usage.
>
> That sounds like a possible case for a hybrid function, with an unrolled
> initial part testing some number of characters to cover short strings (it
> might be possible to get things aligned at the same time) and the more
> complicated version for longer strings.
Of course the difficulty with strchr (compared with memchr) is that you have
no length parameter to hint at how much is left, and thus whether it's
worth making that switch; so it has to be a heuristic and it's going to cost
you something in the small case.
> What's the actual size distribution you see in strchr use?
It varies heavily by program. I only found one of the SPEC benchmarks using
it to a measurable amount (i.e. showed up in profile), and that was
mostly in 24byte strings
with the match at random positions within the string.
From some embedded benchmarks I found that almost all the uses of strchr
were calls with less than 8 byte strings (mostly unexpected fallout
from within libc rather
than the meat of the benchmark).
One of the things people commonly seem to do with strchr is see whether a
character is in a set, and run that strchr along a string - e.g. something like
char *mystr=....
char tmp;
for(tmp=*mystr;tmp;mystr++) {
tmp=*(mystr++);
if (strchr(" \t\n\r", tmp)!=NULL) { do something }
}
With the string being searched depressingly small. There are some places where
you might hope the compiler could be smart and do something with that,
although often the string being searched is actually a variable (think
$IFS in shell).
(I fancy something like a strchr_short with the compiler calling that
when it knows
the string is less than some length - but how do you define that
length between the
compiler and library?)
The other thing I found in some examples was splitting env strings; so
searching for
the '=' in NAME=VALUE, the NAME parts tend not to be particularly long.
The worst case tends to be where you're using strchr on a long string and you
don't actually find the match.
A ltrace of gcc's cc1 shows a few of the cases - e.g.
Mostly short identifiers:
strchr("nothrow", ' ') = NULL
strchr("final", ' ') = NULL
strchr("dfinish", ' ') = NULL
This is a case that can get long - finding '.' in filenames; this can
be one where
you get longer strings without a match.
strchr("/usr/local/include", '.') = NULL
strchr("/usr/local/include", '.') = NULL
strchr("/usr/lib/arm-linux-gnueabi/gcc/arm-linux-gnueabi/4.5.2/include",
'.') = ".5.2/include"
I suspect something like parsing a format string:
strchr("-+ #0", 's') = NULL
Splitting assignments:
strchr("__UINT16_C(c)=c", '=') = "=c"
strchr("__UINT_LEAST32_MAX__=4294967295U", '=') = "=4294967295U"
Here's a profile graph of different strlen's on an ARM:
https://wiki.linaro.org/WorkingGroups/ToolChain/Benchmarks/InitialStrchr?action=AttachFile&do=get&target=panda-01-strchr-git44154ec-strchr-abs.png
That 'simple' one is showing the benefit at the short lengths,
the 'smarter' one I have is doing 8 bytes/loop and is nice on the long
strings - but as you can see worse at the short ones.
Dave
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [ARM] Optimised strchr and strlen
2011-12-20 20:29 ` Richard Henderson
@ 2011-12-21 10:56 ` David Gilbert
0 siblings, 0 replies; 10+ messages in thread
From: David Gilbert @ 2011-12-21 10:56 UTC (permalink / raw)
To: Richard Henderson; +Cc: libc-ports, joseph, patches
On 20 December 2011 20:29, Richard Henderson <rth@twiddle.net> wrote:
> On 12/19/2011 09:21 AM, Dr. David Alan Gilbert wrote:
>> + @ r1 currently points to the 2nd byte of the word containing the 0
>> + tst r2, # CHARTSTMASK(0) @ 1st character
>> + bne 10f
>> + adds r1,r1,#1
>> + tst r2, # CHARTSTMASK(1) @ 2nd character
>> + ittt eq
>> + addeq r1,r1,#1
>> + tsteq r2, # (3<<15) @ 2nd & 3rd character
>> + @ If not the 3rd must be the last one
>> + addeq r1,r1,#1
>
> No need to search -- clz can do that for you.
>
> #ifdef __ARMEL__
> rbit r2, r2
> #endif
> clz r2, r2
> lsrs r2, r2, #3
> adds r1, r1, r2
Hi Richard,
Thanks for the suggestion - I'd seen that form in some code from another
library and decided because of that not to roll it into my code; however
since there have now been 3 separate people who've suggested it to me
independently I don't see why not!
Thanks,
Dave
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [ARM] Optimised strchr and strlen
2011-12-21 10:55 ` David Gilbert
@ 2011-12-21 21:21 ` Richard Henderson
2011-12-23 20:32 ` David Gilbert
0 siblings, 1 reply; 10+ messages in thread
From: Richard Henderson @ 2011-12-21 21:21 UTC (permalink / raw)
To: David Gilbert; +Cc: Joseph S. Myers, libc-ports, patches
On 12/21/2011 02:55 AM, David Gilbert wrote:
> That 'simple' one is showing the benefit at the short lengths,
> the 'smarter' one I have is doing 8 bytes/loop and is nice on the long
> strings - but as you can see worse at the short ones.
Having not seen your "smarter" strchr, it's hard to suggest anything
concrete. I'd have thought that there's enough slack in load delay
that one or two arithmetic operations could be done without penalty...
Something like performing a simple compare loop looking for "alignment plus":
...
bic r3, r0, #7
and r1, r1, #255
adds r3, r3, #32
1:
ldrb r2, [r0],#1
cmp r2, r1
cbz r2, .Lfound_zero
it ne
cmpne r0, r3
bne 1b
cmp r2, r1
beq .Lfound
@ Here, r0 is aligned. Do something word-based.
...
or even just
and r3, r0, #7
and r1, r1, #255
rsb r3, r3, #32
1:
ldrb r2, [r0],#1
cmp r2, r1
beq .Lfound
subs r3, r3, #1
cbz r2, .Lfound_zero
bne 1b
@ Here, r0 is aligned. Do something word-based.
r~
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [ARM] Optimised strchr and strlen
2011-12-21 21:21 ` Richard Henderson
@ 2011-12-23 20:32 ` David Gilbert
2011-12-24 21:01 ` Richard Henderson
0 siblings, 1 reply; 10+ messages in thread
From: David Gilbert @ 2011-12-23 20:32 UTC (permalink / raw)
To: Richard Henderson; +Cc: Joseph S. Myers, libc-ports, michael.hope
On 21 December 2011 21:20, Richard Henderson <rth@twiddle.net> wrote:
> On 12/21/2011 02:55 AM, David Gilbert wrote:
>> That 'simple' one is showing the benefit at the short lengths,
>> the 'smarter' one I have is doing 8 bytes/loop and is nice on the long
>> strings - but as you can see worse at the short ones.
>
> Having not seen your "smarter" strchr, it's hard to suggest anything
> concrete. I'd have thought that there's enough slack in load delay
> that one or two arithmetic operations could be done without penalty...
Sure; it's pretty much the same trick as my strlen routine.
> Something like performing a simple compare loop looking for "alignment plus":
<snip>
> or even just
>
> and r3, r0, #7
> and r1, r1, #255
> rsb r3, r3, #32
> 1:
> ldrb r2, [r0],#1
> cmp r2, r1
> beq .Lfound
> subs r3, r3, #1
> cbz r2, .Lfound_zero
> bne 1b
> @ Here, r0 is aligned. Do something word-based.
OK, so I gave that a go - and the results are:
https://wiki.linaro.org/WorkingGroups/ToolChain/Benchmarks/InitialStrchr?action=AttachFile&do=get&target=strchr-fiddle20111223-rth-strchr-abs.png
The line on that graph above labelled 'fiddle' is the one where I
tried that code; I can't explain the
really grim dip (at 16 bytes) - I checked with a debugger that it is
going around the loop
upto 32 times. However, even if we ignored the dip at 16, it's still
not a nice result - it's worse
than the simple routine I posted up to 64 bytes. One guess is that
loop with 3 branches in might
be too much for the branch predictor (I remember there are
restrictions on how close together they can go.
I might be able to make that a bit better for the larger cases by
using the clz trick you suggested for the strlen
for the end-of-fastloop case; and I'll give that a go; but it's not
going to help at the bottom.
(OT: Does anyone know how to get R to use a log2 X graph rather than log 10?)
Here are the raw numbers that maybe easier than the graphs (Note the
strings tested are aligned):
smarter_strchr_armv7fiddle: ,6553600, loops of ,64, bytes=400.000000
MB, transferred in ,1395904541.000000 ns, giving, 286.552546 MB/s
smarter_strchr_armv7fiddle: ,6553600, loops of ,62, bytes=387.500000
MB, transferred in ,1258880615.000000 ns, giving, 307.813144 MB/s
smarter_strchr_armv7fiddle: ,13107200, loops of ,32, bytes=400.000000
MB, transferred in ,1813507080.000000 ns, giving, 220.567101 MB/s
smarter_strchr_armv7fiddle: ,26214400, loops of ,16, bytes=400.000000
MB, transferred in ,2922332764.000000 ns, giving, 136.876951 MB/s
smarter_strchr_armv7fiddle: ,52428800, loops of ,8, bytes=400.000000
MB, transferred in ,1904724122.000000 ns, giving, 210.004166 MB/s
smarter_strchr_armv7fiddle: ,104857600, loops of ,4, bytes=400.000000
MB, transferred in ,2191650391.000000 ns, giving, 182.510861 MB/s
smarter_strchr_armv7fiddle: ,209715200, loops of ,2, bytes=400.000000
MB, transferred in ,3548339844.000000 ns, giving, 112.728774 MB/s
smarter_strchr_armv7: ,6553600, loops of ,64, bytes=400.000000 MB,
transferred in ,726409912.000000 ns, giving, 550.653279 MB/s
smarter_strchr_armv7: ,6553600, loops of ,62, bytes=387.500000 MB,
transferred in ,743652344.000000 ns, giving, 521.076822 MB/s
smarter_strchr_armv7: ,13107200, loops of ,32, bytes=400.000000 MB,
transferred in ,874023437.000000 ns, giving, 457.653632 MB/s
smarter_strchr_armv7: ,26214400, loops of ,16, bytes=400.000000 MB,
transferred in ,1160278321.000000 ns, giving, 344.744871 MB/s
smarter_strchr_armv7: ,52428800, loops of ,8, bytes=400.000000 MB,
transferred in ,1617584229.000000 ns, giving, 247.282332 MB/s
smarter_strchr_armv7: ,104857600, loops of ,4, bytes=400.000000 MB,
transferred in ,3235260010.000000 ns, giving, 123.637667 MB/s
smarter_strchr_armv7: ,209715200, loops of ,2, bytes=400.000000 MB,
transferred in ,7044738769.000000 ns, giving, 56.779962 MB/s
simple_strchr: ,6553600, loops of ,64, bytes=400.000000 MB,
transferred in ,1298034668.000000 ns, giving, 308.158179 MB/s
simple_strchr: ,6553600, loops of ,62, bytes=387.500000 MB,
transferred in ,1265472413.000000 ns, giving, 306.209757 MB/s
simple_strchr: ,13107200, loops of ,32, bytes=400.000000 MB,
transferred in ,1447998047.000000 ns, giving, 276.243467 MB/s
simple_strchr: ,26214400, loops of ,16, bytes=400.000000 MB,
transferred in ,1245819092.000000 ns, giving, 321.073904 MB/s
simple_strchr: ,52428800, loops of ,8, bytes=400.000000 MB,
transferred in ,1435119629.000000 ns, giving, 278.722409 MB/s
simple_strchr: ,104857600, loops of ,4, bytes=400.000000 MB,
transferred in ,1878479004.000000 ns, giving, 212.938233 MB/s
simple_strchr: ,209715200, loops of ,2, bytes=400.000000 MB,
transferred in ,3339569092.000000 ns, giving, 119.775932 MB/s
Dave
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [ARM] Optimised strchr and strlen
2011-12-23 20:32 ` David Gilbert
@ 2011-12-24 21:01 ` Richard Henderson
2011-12-30 21:36 ` David Gilbert
0 siblings, 1 reply; 10+ messages in thread
From: Richard Henderson @ 2011-12-24 21:01 UTC (permalink / raw)
To: David Gilbert; +Cc: Joseph S. Myers, libc-ports, michael.hope
[-- Attachment #1: Type: text/plain, Size: 553 bytes --]
On 12/23/2011 12:31 PM, David Gilbert wrote:
> Sure; it's pretty much the same trick as my strlen routine.
...
> OK, so I gave that a go - and the results are:
I can't help but wonder if just the one branch in the first loop is best.
Also, it appears one can use uqadd8 and do the aligned two words in parallel
rather than having everything serialize on the GT flags and SEL.
I've run this through glibc's test-strchr, but havn't gotten around to
benchmarking it at all. Since you've already got that set up, perhaps
you could give it a whirl.
r~
[-- Attachment #2: strchr.S --]
[-- Type: text/plain, Size: 3303 bytes --]
@ We occasionally want to use the S form simply to achieve a smaller
@ instruction form in Thumb mode. Never set the flags in ARM mode.
#ifdef __thumb2__
# define s(insn) insn##s
#else
# define s(insn) insn
#endif
.syntax unified
.text
#ifdef __thumb2__
.thumb
.thumb_func
.arch armv6t2
#else
.arch armv7-a
#endif
.global strchr
.type strchr, %function
strchr:
@ r0 = start of string
@ r1 = character to match
@ returns NULL for no match, or a pointer to the match
@ To cater to long strings, we want to search through a few
@ characters until we reach an aligned pointer. To cater to
@ small strings, we don't want to start doing word operations
@ immediately. The compromise is a maximum of 32 bytes less
@ whatever is required to end with an aligned pointer.
@ r3 = number of characters to search in alignment loop
and r3, r0, #7
uxtb r1, r1
rsb r3, r3, #32
@ Loop until we find ...
1: ldrb r2, [r0], #1
subs r3, r3, #1 @ ... the aligment point
it ne
cmpne r2, r1 @ ... or the character
it ne
cmpne r2, #0 @ ... or EOS
bne 1b
@ Disambiguate the exit possibilites above
cmp r2, r1 @ Found the character
itt eq
subeq r0, r0, #1
bxeq lr
cmp r2, #0 @ Found EOS
itt eq
moveq r0, #0
bxeq lr
@ So now we're aligned. Now we actually need a stack frame.
push { r4, r5, r6, r7 }
orr r1, r1, r1, lsl #8 @ Replicate C to all bytes
movw ip, #0xfefe
orr r1, r1, r1, lsl #16
movt ip, #0xfefe
@ Loop searching for EOS or C, 8 bytes at a time.
2: ldrd r2, r3, [r0], #8
@ Adding (unsigned saturating) 0xfe means result of 0xfe for any byte
@ that was originally zero and 0xff otherwise. Therefore we consider
@ the lsb of each byte the "found" bit, with 0 for a match.
uqadd8 r4, r2, ip @ Find EOS
uqadd8 r5, r3, ip
eor r6, r2, r1 @ Convert C bytes to 0
eor r7, r3, r1
uqadd8 r6, r6, ip @ Find C
uqadd8 r7, r7, ip
s(and) r4, r4, r6 @ Combine found for EOS and C
s(and) r5, r5, r7
and r6, r4, r5 @ Combine the two words
mvns r6, r6 @ Test for any found bit true
beq 2b
@ Invert the sense of the found bits. After this we have 1 in
@ any byte that contains a match, and 0 otherwise.
s(mvn) r5, r5
mvns r4, r4
@ Found something. Disambiguate between first and second words.
@ Adjust r0 to point to the word containing the match.
@ Adjust r2 to the contents of the word containing the match.
@ Adjust r4 to the found bits for the word containing the match.
iteee ne
subne r0, r0, #8
subeq r0, r0, #4
moveq r4, r5
moveq r2, r3
@ Find the bit-offset of the match within the word.
#ifdef __ARMEL__
@ For little-endian, we only need to reverse the bits so that
@ count-leading-zeros becomes in effect count-trailing-zeros.
rbit r4, r4
clz r3, r4
#else
@ For big-endian, we're matching 0x01 (not 0x80), and so the
@ bit offset is 7 too high. Also, we byte-swap the word so
@ that we can shift down to extract the found byte.
clz r3, r4
rev r2, r2
s(sub) r3, r3, #7
#endif
s(lsr) r2, r2, r3 @ Shift down found byte
add r0, r0, r3, lsr #3 @ Adjust the pointer to the found byte
uxtb r2, r2 @ Extract found byte
uxtb r1, r1 @ Undo replication of C
pop { r4, r5, r6, r7 }
@ Disambiguate between EOS and C.
cmp r2, r1
it ne
movne r0, #0 @ Found EOS, return NULL
bx lr
.size strchr, . - strchr
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [ARM] Optimised strchr and strlen
2011-12-24 21:01 ` Richard Henderson
@ 2011-12-30 21:36 ` David Gilbert
0 siblings, 0 replies; 10+ messages in thread
From: David Gilbert @ 2011-12-30 21:36 UTC (permalink / raw)
To: Richard Henderson; +Cc: Joseph S. Myers, libc-ports, michael.hope
On 24 December 2011 21:01, Richard Henderson <rth@twiddle.net> wrote:
> On 12/23/2011 12:31 PM, David Gilbert wrote:
>> Sure; it's pretty much the same trick as my strlen routine.
> ...
>> OK, so I gave that a go - and the results are:
>
> I can't help but wonder if just the one branch in the first loop is best.
Yes.
> Also, it appears one can use uqadd8 and do the aligned two words in parallel
> rather than having everything serialize on the GT flags and SEL.
>
> I've run this through glibc's test-strchr, but havn't gotten around to
> benchmarking it at all. Since you've already got that set up, perhaps
> you could give it a whirl.
Here we go - you're code is the green line; rth_strchr - your uqadd8
trick is very nice;
the peak speed is a nice bit higher than my version using a set of uadd8's and
sel (you get 1 instruction less in the main loop).
https://wiki.linaro.org/WorkingGroups/ToolChain/Benchmarks/InitialStrchr?action=AttachFile&do=view&target=strchr-withrth-strchr-abs.png
The simple routine is still easily winning below 32 bytes though, and
there is still that odd notch at 16.
(I think your uqadd8 trick would be a nice improvement on my strlen
and memchr routines).
Dave
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2011-12-30 21:36 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-12-19 17:21 [ARM] Optimised strchr and strlen Dr. David Alan Gilbert
[not found] ` <CAODfWeEuNWtsWebJd_GRDnYzTkYW9GWHoSt5qcYRAADTaCsVgQ@mail.gmail.com>
2011-12-19 17:39 ` David Gilbert
2011-12-20 20:29 ` Richard Henderson
2011-12-21 10:56 ` David Gilbert
2011-12-20 23:07 ` Joseph S. Myers
2011-12-21 10:55 ` David Gilbert
2011-12-21 21:21 ` Richard Henderson
2011-12-23 20:32 ` David Gilbert
2011-12-24 21:01 ` Richard Henderson
2011-12-30 21:36 ` David Gilbert
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).