* [PATCH 3/5] arm: Implement armv6 optimized strchr
2013-03-06 23:54 [PATCH 0/5] ARM v6 string routines Richard Henderson
2013-03-06 23:54 ` [PATCH 4/5] arm: Implement armv6 optimized strrchr Richard Henderson
@ 2013-03-06 23:54 ` Richard Henderson
2013-03-07 1:07 ` Joseph S. Myers
2013-03-06 23:54 ` [PATCH 5/5] arm: Implement armv6 optimized rawmemchr Richard Henderson
` (2 subsequent siblings)
4 siblings, 1 reply; 11+ messages in thread
From: Richard Henderson @ 2013-03-06 23:54 UTC (permalink / raw)
To: libc-ports; +Cc: joseph
---
ports/sysdeps/arm/armv6/strchr.S | 143 +++++++++++++++++++++++++++++++++++++++
1 file changed, 143 insertions(+)
create mode 100644 ports/sysdeps/arm/armv6/strchr.S
diff --git a/ports/sysdeps/arm/armv6/strchr.S b/ports/sysdeps/arm/armv6/strchr.S
new file mode 100644
index 0000000..c856283
--- /dev/null
+++ b/ports/sysdeps/arm/armv6/strchr.S
@@ -0,0 +1,143 @@
+/* strchr -- find the first instance of C in a nul-terminated string.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .syntax unified
+ .text
+
+ENTRY (strchr)
+ @ r0 = start of string
+ @ r1 = character to match
+ @ returns NULL for no match, or a pointer to the match
+ ldrb r2, [r0] @ load the first byte asap
+ uxtb r1, r1 @ Keep only the low byte of C
+
+ @ To cater to long strings, we want to search through a few
+ @ characters until we reach an aligned pointer. To cater to
+ @ small strings, we don't want to start doing word operations
+ @ immediately. The compromise is a maximum of 16 bytes less
+ @ whatever is required to end with an aligned pointer.
+ @ r3 = number of characters to search in alignment loop
+ and r3, r0, #7 @ r3 = offset of r0 within its 8-byte block
+ rsb r3, r3, #15 @ 16 - 1 peeled loop iteration
+ cmp r2, r1 @ Found C?
+ it ne
+ cmpne r2, #0 @ Found EOS?
+ beq 99f
+
+ @ Loop until we find ...
+1: ldrb r2, [r0, #1]! @ Pre-increment r0 and load the next byte
+ subs r3, r3, #1 @ ... the alignment point
+ it ne
+ cmpne r2, r1 @ ... or the character
+ it ne
+ cmpne r2, #0 @ ... or EOS
+ bne 1b
+
+ @ Disambiguate the exit possibilities above
+ cmp r2, r1 @ Found the character
+ it ne
+ cmpne r2, #0 @ Found EOS
+ beq 99f
+ add r0, r0, #1 @ Step past the byte just checked; r0 is now 8-byte aligned
+
+ @ So now we're aligned. Now we actually need a stack frame.
+ push { r4, r5, r6, r7 }
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (r4, 0)
+ cfi_rel_offset (r5, 4)
+ cfi_rel_offset (r6, 8)
+ cfi_rel_offset (r7, 12)
+
+ ldrd r2, r3, [r0], #8 @ Load the first two words, post-increment r0
+ orr r1, r1, r1, lsl #8 @ Replicate C to all bytes
+#ifdef ARCH_HAS_T2
+ movw ip, #0x0101
+ pld [r0, #64]
+ movt ip, #0x0101
+#else
+ ldr ip, =0x01010101
+ pld [r0, #64]
+#endif
+ orr r1, r1, r1, lsl #16
+
+ @ Loop searching for EOS or C, 8 bytes at a time.
+2:
+ @ Subtracting (unsigned saturating) from 1 means result of 1 for
+ @ any byte that was originally zero and 0 otherwise. Therefore
+ @ we consider the lsb of each byte the "found" bit.
+ uqsub8 r4, ip, r2 @ Find EOS
+ eor r6, r2, r1 @ Convert C bytes to 0
+ uqsub8 r5, ip, r3
+ eor r7, r3, r1
+ uqsub8 r6, ip, r6 @ Find C
+ pld [r0, #128] @ Prefetch 2 lines ahead
+ uqsub8 r7, ip, r7
+ orr r4, r4, r6 @ Combine found for EOS and C
+ orr r5, r5, r7
+ orrs r6, r4, r5 @ Combine the two words
+ it eq
+ ldrdeq r2, r3, [r0], #8 @ Nothing found: load the next two words
+ beq 2b
+
+ @ Found something. Disambiguate between first and second words.
+ @ Adjust r0 to point to the word containing the match.
+ @ Adjust r2 to the contents of the word containing the match.
+ @ Adjust r4 to the found bits for the word containing the match.
+ cmp r4, #0
+ sub r0, r0, #4 @ r0 was post-incremented past both words
+ itte eq
+ moveq r4, r5
+ moveq r2, r3
+ subne r0, r0, #4
+
+ @ Find the bit-offset of the match within the word.
+#if defined(__ARMEL__)
+ @ For LE, swap the found word so clz searches from the little end.
+ rev r4, r4
+#else
+ @ For BE, byte swap the word to make it easier to extract the byte.
+ rev r2, r2
+#endif
+ @ We're counting 0x01 (not 0x80), so the bit offset is 7 too high.
+ clz r3, r4
+ sub r3, r3, #7
+ lsr r2, r2, r3 @ Shift down found byte
+ uxtb r1, r1 @ Undo replication of C
+ uxtb r2, r2 @ Extract found byte
+ add r0, r0, r3, lsr #3 @ Adjust the pointer to the found byte
+
+ pop { r4, r5, r6, r7 }
+ cfi_adjust_cfa_offset (-16)
+ cfi_restore (r4)
+ cfi_restore (r5)
+ cfi_restore (r6)
+ cfi_restore (r7)
+
+ @ Disambiguate between EOS and C.
+99:
+ cmp r2, r1 @ r2 = byte that stopped the search, r1 = C
+ it ne
+ movne r0, #0 @ Found EOS, return NULL
+ bx lr
+
+END (strchr)
+
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
--
1.8.1.2
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 0/5] ARM v6 string routines
@ 2013-03-06 23:54 Richard Henderson
2013-03-06 23:54 ` [PATCH 4/5] arm: Implement armv6 optimized strrchr Richard Henderson
` (4 more replies)
0 siblings, 5 replies; 11+ messages in thread
From: Richard Henderson @ 2013-03-06 23:54 UTC (permalink / raw)
To: libc-ports; +Cc: joseph
Not all of them yet, but probably the most important ones.
Changed since v1 is that these are now appropriate for armv6
as opposed to only armv6t2.
Tested on A15, and BE via qemu.
r~
Richard Henderson (5):
arm: Implement armv6 optimized strlen
arm: Implement armv6 optimized strcpy
arm: Implement armv6 optimized strchr
arm: Implement armv6 optimized strrchr
arm: Implement armv6 optimized rawmemchr
ports/sysdeps/arm/armv6/rawmemchr.S | 105 +++++++++++++++++
ports/sysdeps/arm/armv6/stpcpy.S | 1 +
ports/sysdeps/arm/armv6/strchr.S | 143 +++++++++++++++++++++++
ports/sysdeps/arm/armv6/strcpy.S | 218 ++++++++++++++++++++++++++++++++++++
ports/sysdeps/arm/armv6/strlen.S | 99 ++++++++++++++++
ports/sysdeps/arm/armv6/strrchr.S | 129 +++++++++++++++++++++
ports/sysdeps/arm/armv6t2/Implies | 2 +
7 files changed, 697 insertions(+)
create mode 100644 ports/sysdeps/arm/armv6/rawmemchr.S
create mode 100644 ports/sysdeps/arm/armv6/stpcpy.S
create mode 100644 ports/sysdeps/arm/armv6/strchr.S
create mode 100644 ports/sysdeps/arm/armv6/strcpy.S
create mode 100644 ports/sysdeps/arm/armv6/strlen.S
create mode 100644 ports/sysdeps/arm/armv6/strrchr.S
create mode 100644 ports/sysdeps/arm/armv6t2/Implies
--
1.8.1.2
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/5] arm: Implement armv6 optimized strlen
2013-03-06 23:54 [PATCH 0/5] ARM v6 string routines Richard Henderson
` (3 preceding siblings ...)
2013-03-06 23:54 ` [PATCH 2/5] arm: Implement armv6 optimized strcpy Richard Henderson
@ 2013-03-06 23:54 ` Richard Henderson
2013-03-07 1:02 ` Joseph S. Myers
4 siblings, 1 reply; 11+ messages in thread
From: Richard Henderson @ 2013-03-06 23:54 UTC (permalink / raw)
To: libc-ports; +Cc: joseph
Twice as fast for long strings and 50% faster for short strings
over the armv4 version on A15.
---
ports/sysdeps/arm/armv6/strlen.S | 99 +++++++++++++++++++++++++++++++++++++++
ports/sysdeps/arm/armv6t2/Implies | 2 +
2 files changed, 101 insertions(+)
create mode 100644 ports/sysdeps/arm/armv6/strlen.S
create mode 100644 ports/sysdeps/arm/armv6t2/Implies
diff --git a/ports/sysdeps/arm/armv6/strlen.S b/ports/sysdeps/arm/armv6/strlen.S
new file mode 100644
index 0000000..a53d414
--- /dev/null
+++ b/ports/sysdeps/arm/armv6/strlen.S
@@ -0,0 +1,99 @@
+/* strlen -- find the length of a nul-terminated string.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .syntax unified
+ .text
+
+ENTRY (strlen)
+ @ r0 = start of string
+ ldrb r2, [r0] @ load the first byte asap
+
+ @ To cater to long strings, we want to search through a few
+ @ characters until we reach an aligned pointer. To cater to
+ @ small strings, we don't want to start doing word operations
+ @ immediately. The compromise is a maximum of 16 bytes less
+ @ whatever is required to end with an aligned pointer.
+ @ r3 = number of characters to search in alignment loop
+ and r3, r0, #7 @ r3 = offset of r0 within its 8-byte block
+ mov r1, r0 @ Save the input pointer
+ rsb r3, r3, #15 @ 16 - 1 peeled loop iteration
+ cmp r2, #0
+ beq 99f
+
+ @ Loop until we find ...
+1: ldrb r2, [r0, #1]! @ Pre-increment r0 and load the next byte
+ subs r3, r3, #1 @ ... the alignment point
+ it ne
+ cmpne r2, #0 @ ... or EOS
+ bne 1b
+
+ @ Disambiguate the exit possibilities above
+ cmp r2, #0 @ Found EOS
+ beq 99f
+ add r0, r0, #1 @ Step past the byte just checked; r0 is now 8-byte aligned
+
+ @ So now we're aligned.
+ ldrd r2, r3, [r0], #8 @ Load the first two words, post-increment r0
+#ifdef ARCH_HAS_T2
+ movw ip, #0x0101
+ pld [r0, #64]
+ movt ip, #0x0101
+#else
+ ldr ip, =0x01010101
+ pld [r0, #64]
+#endif
+
+ @ Loop searching for EOS, 8 bytes at a time.
+ @ Subtracting (unsigned saturating) from 1 for any byte means that
+ @ we get 1 for any byte that was originally zero and 0 otherwise.
+ @ Therefore we consider the lsb of each byte the "found" bit.
+ .balign 16
+2: uqsub8 r2, ip, r2 @ Find EOS
+ uqsub8 r3, ip, r3
+ pld [r0, #128] @ Prefetch 2 lines ahead
+ orrs r3, r3, r2 @ Combine the two words
+ it eq
+ ldrdeq r2, r3, [r0], #8 @ Nothing found: load the next two words
+ beq 2b
+
+ @ Found something. Disambiguate between first and second words.
+ @ Adjust r0 to point to the word containing the match.
+ @ Adjust r2 to the found bits for the word containing the match.
+ cmp r2, #0
+ sub r0, r0, #4 @ r0 was post-incremented past both words
+ ite eq
+ moveq r2, r3 @ r2 == 0 here, so r3 holds only second-word bits
+ subne r0, r0, #4
+
+ @ Find the bit-offset of the match within the word. Note that the
+ @ bit result from clz will be 7 higher than "true", but we'll
+ @ immediately discard those bits converting to a byte offset.
+#ifdef __ARMEL__
+ rev r2, r2 @ For LE, count from the little end
+#endif
+ clz r2, r2
+ add r0, r0, r2, lsr #3 @ Adjust the pointer to the found byte
+99:
+ sub r0, r0, r1 @ Subtract input to compute length
+ bx lr
+
+END (strlen)
+
+libc_hidden_builtin_def (strlen)
diff --git a/ports/sysdeps/arm/armv6t2/Implies b/ports/sysdeps/arm/armv6t2/Implies
new file mode 100644
index 0000000..20a87fc
--- /dev/null
+++ b/ports/sysdeps/arm/armv6t2/Implies
@@ -0,0 +1,2 @@
+# We can do everything that 6 can
+arm/armv6
--
1.8.1.2
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/5] arm: Implement armv6 optimized strcpy
2013-03-06 23:54 [PATCH 0/5] ARM v6 string routines Richard Henderson
` (2 preceding siblings ...)
2013-03-06 23:54 ` [PATCH 5/5] arm: Implement armv6 optimized rawmemchr Richard Henderson
@ 2013-03-06 23:54 ` Richard Henderson
2013-03-07 1:12 ` Joseph S. Myers
2013-03-06 23:54 ` [PATCH 1/5] arm: Implement armv6 optimized strlen Richard Henderson
4 siblings, 1 reply; 11+ messages in thread
From: Richard Henderson @ 2013-03-06 23:54 UTC (permalink / raw)
To: libc-ports; +Cc: joseph
Four times faster than the byte-by-byte default version.
---
ports/sysdeps/arm/armv6/stpcpy.S | 1 +
ports/sysdeps/arm/armv6/strcpy.S | 218 +++++++++++++++++++++++++++++++++++++++
2 files changed, 219 insertions(+)
create mode 100644 ports/sysdeps/arm/armv6/stpcpy.S
create mode 100644 ports/sysdeps/arm/armv6/strcpy.S
diff --git a/ports/sysdeps/arm/armv6/stpcpy.S b/ports/sysdeps/arm/armv6/stpcpy.S
new file mode 100644
index 0000000..21a4f38
--- /dev/null
+++ b/ports/sysdeps/arm/armv6/stpcpy.S
@@ -0,0 +1 @@
+/* Defined in strcpy.S. */
diff --git a/ports/sysdeps/arm/armv6/strcpy.S b/ports/sysdeps/arm/armv6/strcpy.S
new file mode 100644
index 0000000..41f6443
--- /dev/null
+++ b/ports/sysdeps/arm/armv6/strcpy.S
@@ -0,0 +1,218 @@
+/* strcpy -- copy a nul-terminated string.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Endian independent macros for shifting bytes within registers. */
+#ifdef __ARMEB__
+#define lsh_gt lsr
+#define lsh_ls lsl
+#else
+#define lsh_gt lsl
+#define lsh_ls lsr
+#endif
+
+ .syntax unified
+ .text
+
+ENTRY (__stpcpy)
+ @ Signal stpcpy with NULL in IP; .Lreturn tests IP to pick the result.
+ mov ip, #0
+ b 0f @ Join the copy code at local label 0 in strcpy
+END (__stpcpy)
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
+
+ENTRY (strcpy)
+ @ r0 = DEST, r1 = SRC. Signal strcpy with DEST in IP.
+ mov ip, r0 @ .Lreturn tests ip to pick the return value
+0:
+ pld [r0]
+ pld [r1]
+
+ @ To cater to long strings, we want 8 byte alignment in the source.
+ @ To cater to small strings, we don't want to start that right away.
+ @ Loop up to 16 times, less whatever it takes to reach alignment.
+ and r3, r1, #7
+ rsb r3, r3, #16
+
+ @ Loop until we find ...
+1: ldrb r2, [r1], #1
+ subs r3, r3, #1 @ ... the alignment point
+ strb r2, [r0], #1 @ strb leaves the flags from subs intact
+ it ne
+ cmpne r2, #0 @ ... or EOS
+ bne 1b
+
+ @ Disambiguate the exit possibilities above
+ cmp r2, #0 @ Found EOS
+ beq .Lreturn
+
+ @ Load the next two words asap
+ ldrd r2, r3, [r1], #8
+ pld [r0, #64]
+ pld [r1, #64]
+
+ @ For longer strings, we actually need a stack frame.
+ push { r4, r5, r6, r7 }
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (r4, 0)
+ cfi_rel_offset (r5, 4)
+ cfi_rel_offset (r6, 8)
+ cfi_rel_offset (r7, 12)
+
+ @ Subtracting (unsigned saturating) from 1 for any byte means result
+ @ of 1 for any byte that was originally zero and 0 otherwise.
+ @ Therefore we consider the lsb of each byte the "found" bit.
+#ifdef ARCH_HAS_T2
+ movw r7, #0x0101
+ tst r0, #3 @ Test alignment of DEST
+ movt r7, #0x0101
+#else
+ ldr r7, =0x01010101 @ Must be r7: ip still holds the strcpy/stpcpy flag
+ tst r0, #3 @ Test alignment of DEST
+#endif
+ bne .Lunaligned
+
+ @ So now source (r1) is aligned to 8, and dest (r0) is aligned to 4.
+ @ Loop, reading 8 bytes at a time, searching for EOS.
+ .balign 16
+2: uqsub8 r4, r7, r2 @ Find EOS
+ uqsub8 r5, r7, r3
+ pld [r1, #128]
+ cmp r4, #0 @ EOS in first word?
+ pld [r0, #128]
+ bne 3f
+ str r2, [r0], #4
+ cmp r5, #0 @ EOS in second word?
+ bne 4f
+ str r3, [r0], #4
+ ldrd r2, r3, [r1], #8
+ b 2b
+
+3: sub r1, r1, #4 @ backup to first word
+4: sub r1, r1, #4 @ backup to second word
+
+ @ ... then finish up any tail a byte at a time.
+ @ Note that we generally back up and re-read source bytes,
+ @ but we'll not re-write dest bytes.
+.Lbyte_loop:
+ ldrb r2, [r1], #1
+ cmp r2, #0
+ strb r2, [r0], #1 @ The terminating NUL is stored as well
+ bne .Lbyte_loop
+
+ pop { r4, r5, r6, r7 }
+ cfi_remember_state
+ cfi_adjust_cfa_offset (-16)
+ cfi_restore (r4)
+ cfi_restore (r5)
+ cfi_restore (r6)
+ cfi_restore (r7)
+
+.Lreturn:
+ cmp ip, #0 @ Was this strcpy or stpcpy?
+ ite eq
+ subeq r0, r0, #1 @ stpcpy: undo post-inc from store
+ movne r0, ip @ strcpy: return original dest
+ bx lr
+
+.Lunaligned:
+ cfi_restore_state
+ @ Here, source is aligned to 8, but the destination is not word
+ @ aligned. Therefore we have to shift the data in order to be
+ @ able to perform aligned word stores.
+
+ @ Find out which misalignment we're dealing with.
+ tst r0, #1
+ beq .Lunaligned2
+ tst r0, #2
+ bne .Lunaligned3
+ @ Fallthru to .Lunaligned1.
+
+.macro unaligned_copy unalign
+ @ Prologue to unaligned loop. Seed shifted non-zero bytes.
+ uqsub8 r4, r7, r2 @ Find EOS
+ uqsub8 r5, r7, r3
+ cmp r4, #0 @ EOS in first word?
+ it ne
+ subne r1, r1, #8
+ bne .Lbyte_loop
+#ifdef __ARMEB__
+ rev r2, r2 @ Byte stores below need LE data
+#endif
+ @ Store a few bytes from the first word.
+ @ At the same time we align r0 and shift out bytes from r2.
+.rept 4-\unalign
+ strb r2, [r0], #1
+ lsr r2, r2, #8
+.endr
+#ifdef __ARMEB__
+ rev r2, r2 @ Undo previous rev
+#endif
+ @ Rotated unaligned copy loop. The tail of the prologue is
+ @ shared with the loop itself.
+ .balign 8
+1: cmp r5, #0 @ EOS in second word?
+ bne 4f
+ @ Combine first and second words
+ orr r2, r2, r3, lsh_gt #(\unalign*8)
+ @ Save leftover bytes from the two words
+ lsh_ls r6, r3, #((4-\unalign)*8)
+ str r2, [r0], #4
+ @ The "real" start of the unaligned copy loop.
+ ldrd r2, r3, [r1], #8 @ Load 8 more bytes
+ uqsub8 r4, r7, r2 @ Find EOS
+ pld [r1, #128]
+ uqsub8 r5, r7, r3
+ pld [r0, #128]
+ cmp r4, #0 @ EOS in first word?
+ bne 3f
+ @ Combine the leftover and the first word
+ orr r6, r6, r2, lsh_gt #(\unalign*8)
+ @ Discard used bytes from the first word.
+ lsh_ls r2, r2, #((4-\unalign)*8)
+ str r6, [r0], #4
+ b 1b
+ @ Found EOS in one of the words; adjust backward
+3: sub r1, r1, #4
+ mov r2, r6
+4: sub r1, r1, #4
+ @ And store the remaining bytes from the leftover
+#ifdef __ARMEB__
+ rev r2, r2
+#endif
+.rept \unalign
+ strb r2, [r0], #1
+ lsr r2, r2, #8
+.endr
+ b .Lbyte_loop
+.endm
+
+.Lunaligned1:
+ unaligned_copy 1
+.Lunaligned2:
+ unaligned_copy 2
+.Lunaligned3:
+ unaligned_copy 3
+
+END (strcpy)
+
+libc_hidden_builtin_def (strcpy)
--
1.8.1.2
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 5/5] arm: Implement armv6 optimized rawmemchr
2013-03-06 23:54 [PATCH 0/5] ARM v6 string routines Richard Henderson
2013-03-06 23:54 ` [PATCH 4/5] arm: Implement armv6 optimized strrchr Richard Henderson
2013-03-06 23:54 ` [PATCH 3/5] arm: Implement armv6 optimized strchr Richard Henderson
@ 2013-03-06 23:54 ` Richard Henderson
2013-03-07 1:19 ` Joseph S. Myers
2013-03-06 23:54 ` [PATCH 2/5] arm: Implement armv6 optimized strcpy Richard Henderson
2013-03-06 23:54 ` [PATCH 1/5] arm: Implement armv6 optimized strlen Richard Henderson
4 siblings, 1 reply; 11+ messages in thread
From: Richard Henderson @ 2013-03-06 23:54 UTC (permalink / raw)
To: libc-ports; +Cc: joseph
---
ports/sysdeps/arm/armv6/rawmemchr.S | 105 ++++++++++++++++++++++++++++++++++++
1 file changed, 105 insertions(+)
create mode 100644 ports/sysdeps/arm/armv6/rawmemchr.S
diff --git a/ports/sysdeps/arm/armv6/rawmemchr.S b/ports/sysdeps/arm/armv6/rawmemchr.S
new file mode 100644
index 0000000..7877bcf
--- /dev/null
+++ b/ports/sysdeps/arm/armv6/rawmemchr.S
@@ -0,0 +1,105 @@
+/* rawmemchr -- find a byte within an unsized memory block.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .syntax unified
+ .text
+
+ENTRY (__rawmemchr)
+ @ r0 = start of string
+ @ r1 = character to match
+ @ returns a pointer to the match, which must be present.
+ ldrb r2, [r0] @ load first byte asap
+
+ @ To cater to long strings, we want to search through a few
+ @ characters until we reach an aligned pointer. To cater to
+ @ small strings, we don't want to start doing word operations
+ @ immediately. The compromise is a maximum of 16 bytes less
+ @ whatever is required to end with an aligned pointer.
+ @ r3 = number of characters to search in alignment loop
+ and r3, r0, #7 @ r3 = offset of r0 within its 8-byte block
+ uxtb r1, r1 @ Keep only the low byte of C
+ rsb r3, r3, #15 @ 16 - 1 peeled loop iteration
+ cmp r2, r1
+ it eq
+ bxeq lr @ First byte matches: r0 already points at it
+
+ @ Loop until we find ...
+1: ldrb r2, [r0, #1]! @ Pre-increment r0 and load the next byte
+ subs r3, r3, #1 @ ... the alignment point
+ it ne
+ cmpne r2, r1 @ ... or C
+ bne 1b
+
+ @ Disambiguate the exit possibilities above
+ cmp r2, r1 @ Found C
+ it eq
+ bxeq lr
+ add r0, r0, #1 @ Step past the byte just checked; r0 is now 8-byte aligned
+
+ @ So now we're aligned.
+ ldrd r2, r3, [r0], #8 @ Load the first two words, post-increment r0
+ orr r1, r1, r1, lsl #8 @ Replicate C to all bytes
+#ifdef ARCH_HAS_T2
+ movw ip, #0x0101
+ pld [r0, #64]
+ movt ip, #0x0101
+#else
+ ldr ip, =0x01010101
+ pld [r0, #64]
+#endif
+ orr r1, r1, r1, lsl #16
+
+ @ Loop searching for C, 8 bytes at a time.
+ @ Subtracting (unsigned saturating) from 1 means result of 1 for
+ @ any byte that was originally zero and 0 otherwise. Therefore
+ @ we consider the lsb of each byte the "found" bit.
+2: eor r2, r2, r1 @ Convert C bytes to 0
+ eor r3, r3, r1
+ uqsub8 r2, ip, r2 @ Find C
+ uqsub8 r3, ip, r3
+ pld [r0, #128]
+ orrs r3, r3, r2 @ Test both words for found
+ it eq
+ ldrdeq r2, r3, [r0], #8 @ Nothing found: load the next two words
+ beq 2b
+
+ @ Found something. Disambiguate between first and second words.
+ @ Adjust r0 to point to the word containing the match.
+ @ Adjust r2 to the found bits for the word containing the match.
+ cmp r2, #0
+ sub r0, r0, #4 @ r0 was post-incremented past both words
+ ite eq
+ moveq r2, r3 @ r2 == 0 here, so r3 holds only second-word bits
+ subne r0, r0, #4
+
+ @ Find the bit-offset of the match within the word. Note that the
+ @ bit result from clz will be 7 higher than "true", but we'll
+ @ immediately discard those bits converting to a byte offset.
+#ifdef __ARMEL__
+ rev r2, r2 @ For LE, count from the little end
+#endif
+ clz r2, r2
+ add r0, r0, r2, lsr #3 @ Adjust the pointer to the found byte
+ bx lr
+
+END (__rawmemchr)
+
+weak_alias (__rawmemchr, rawmemchr)
+libc_hidden_def (__rawmemchr)
--
1.8.1.2
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 4/5] arm: Implement armv6 optimized strrchr
2013-03-06 23:54 [PATCH 0/5] ARM v6 string routines Richard Henderson
@ 2013-03-06 23:54 ` Richard Henderson
2013-03-07 1:16 ` Joseph S. Myers
2013-03-06 23:54 ` [PATCH 3/5] arm: Implement armv6 optimized strchr Richard Henderson
` (3 subsequent siblings)
4 siblings, 1 reply; 11+ messages in thread
From: Richard Henderson @ 2013-03-06 23:54 UTC (permalink / raw)
To: libc-ports; +Cc: joseph
---
ports/sysdeps/arm/armv6/strrchr.S | 129 ++++++++++++++++++++++++++++++++++++++
1 file changed, 129 insertions(+)
create mode 100644 ports/sysdeps/arm/armv6/strrchr.S
diff --git a/ports/sysdeps/arm/armv6/strrchr.S b/ports/sysdeps/arm/armv6/strrchr.S
new file mode 100644
index 0000000..ddd4f7f
--- /dev/null
+++ b/ports/sysdeps/arm/armv6/strrchr.S
@@ -0,0 +1,129 @@
+/* strrchr -- find the last occurrence of C in a nul-terminated string
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .syntax unified
+ .text
+
+ENTRY (strrchr)
+ @ r0 = start of string
+ @ r1 = character to match
+ @ returns NULL for no match, or a pointer to the match
+
+ mov r3, r0 @ r3 = running read pointer
+ mov r0, #0 @ r0 = result so far; NULL until a match is seen
+ uxtb r1, r1 @ Keep only the low byte of C
+
+ @ Loop a few times until we're aligned.
+ tst r3, #7
+ beq 2f
+1: ldrb r2, [r3], #1
+ cmp r2, r1 @ Find the character
+ it eq
+ subeq r0, r3, #1 @ Record its address (r3 was post-incremented)
+ cmp r2, #0 @ Find EOS
+ it eq
+ bxeq lr
+ tst r3, #7 @ Find the alignment point
+ bne 1b
+
+ @ So now we're aligned. Now we actually need a stack frame.
+2: push { r4, r5, r6, r7 }
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (r4, 0)
+ cfi_rel_offset (r5, 4)
+ cfi_rel_offset (r6, 8)
+ cfi_rel_offset (r7, 12)
+
+ orr r1, r1, r1, lsl #8 @ Replicate C to all bytes
+#ifdef ARCH_HAS_T2
+ movw ip, #0x0101
+ movt ip, #0x0101
+#else
+ ldr ip, =0x01010101
+#endif
+ orr r1, r1, r1, lsl #16
+ mov r2, #0 @ No found bits yet
+
+ @ Loop searching for EOS and C, 8 bytes at a time.
+ @ Any time we find a match in a word, we copy the address of
+ @ the word to r0, and the found bits to r2.
+3: ldrd r4, r5, [r3], #8
+ @ Subtracting (unsigned saturating) from 1 means result of 1 for
+ @ any byte that was originally zero and 0 otherwise. Therefore
+ @ we consider the lsb of each byte the "found" bit.
+ uqsub8 r6, ip, r4 @ Find EOS
+ uqsub8 r7, ip, r5
+ eor r4, r4, r1 @ Convert C bytes to 0
+ eor r5, r5, r1
+ uqsub8 r4, ip, r4 @ Find C
+ uqsub8 r5, ip, r5
+ cmp r6, #0 @ Found EOS, first word
+ bne 4f
+ cmp r4, #0 @ Handle C, first word
+ itt ne
+ subne r0, r3, #8
+ movne r2, r4
+ cmp r7, #0 @ Found EOS, second word
+ bne 5f
+ cmp r5, #0 @ Handle C, second word
+ itt ne
+ subne r0, r3, #4
+ movne r2, r5
+ b 3b
+
+ @ Found EOS in second word; fold to first word.
+5: add r3, r3, #4 @ Dec pointer to 2nd word, with below
+ mov r4, r5 @ Overwrite first word C found
+ mov r6, r7 @ Overwrite first word EOS found
+
+ @ Found EOS. Zap found C after EOS.
+4: sub r3, r3, #8 @ Decrement pointer to first word
+#ifdef __ARMEB__
+ @ Byte swap to be congruent with LE, which is easier from here on.
+ rev r6, r6 @ Byte swap found EOS,
+ rev r4, r4 @ ... this found C
+ rev r2, r2 @ ... prev found C
+#endif
+ sub r7, r6, #1 @ Toggle EOS lsb and below
+ eor r6, r6, r7 @ All bits below and including lsb
+ ands r4, r4, r6 @ Zap C above EOS
+ itt ne
+ movne r2, r4 @ Copy to result, if still non-zero
+ movne r0, r3
+
+ pop { r4, r5, r6, r7 }
+ cfi_adjust_cfa_offset (-16)
+ cfi_restore (r4)
+ cfi_restore (r5)
+ cfi_restore (r6)
+ cfi_restore (r7)
+
+ @ Adjust the result pointer if we found a word containing C.
+ cmp r2, #0 @ r2 == 0: r0 is already the final answer
+ clz r2, r2 @ Find the bit offset of the last C
+ itt ne
+ rsbne r2, r2, #32 @ Convert to a count from the right
+ addne r0, r0, r2, lsr #3 @ Convert to byte offset and add.
+ bx lr
+
+END (strrchr)
+
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
--
1.8.1.2
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/5] arm: Implement armv6 optimized strlen
2013-03-06 23:54 ` [PATCH 1/5] arm: Implement armv6 optimized strlen Richard Henderson
@ 2013-03-07 1:02 ` Joseph S. Myers
0 siblings, 0 replies; 11+ messages in thread
From: Joseph S. Myers @ 2013-03-07 1:02 UTC (permalink / raw)
To: Richard Henderson; +Cc: libc-ports
On Wed, 6 Mar 2013, Richard Henderson wrote:
> Twice as fast for long strings and 50% faster for short strings
> over the armv4 version on A15.
> ---
> ports/sysdeps/arm/armv6/strlen.S | 99 +++++++++++++++++++++++++++++++++++++++
> ports/sysdeps/arm/armv6t2/Implies | 2 +
> 2 files changed, 101 insertions(+)
> create mode 100644 ports/sysdeps/arm/armv6/strlen.S
> create mode 100644 ports/sysdeps/arm/armv6t2/Implies
OK.
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 3/5] arm: Implement armv6 optimized strchr
2013-03-06 23:54 ` [PATCH 3/5] arm: Implement armv6 optimized strchr Richard Henderson
@ 2013-03-07 1:07 ` Joseph S. Myers
0 siblings, 0 replies; 11+ messages in thread
From: Joseph S. Myers @ 2013-03-07 1:07 UTC (permalink / raw)
To: Richard Henderson; +Cc: libc-ports
On Wed, 6 Mar 2013, Richard Henderson wrote:
> ---
> ports/sysdeps/arm/armv6/strchr.S | 143 +++++++++++++++++++++++++++++++++++++++
> 1 file changed, 143 insertions(+)
> create mode 100644 ports/sysdeps/arm/armv6/strchr.S
OK.
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/5] arm: Implement armv6 optimized strcpy
2013-03-06 23:54 ` [PATCH 2/5] arm: Implement armv6 optimized strcpy Richard Henderson
@ 2013-03-07 1:12 ` Joseph S. Myers
0 siblings, 0 replies; 11+ messages in thread
From: Joseph S. Myers @ 2013-03-07 1:12 UTC (permalink / raw)
To: Richard Henderson; +Cc: libc-ports
On Wed, 6 Mar 2013, Richard Henderson wrote:
> Four times faster than the byte-by-byte default version.
> ---
> ports/sysdeps/arm/armv6/stpcpy.S | 1 +
> ports/sysdeps/arm/armv6/strcpy.S | 218 +++++++++++++++++++++++++++++++++++++++
> 2 files changed, 219 insertions(+)
> create mode 100644 ports/sysdeps/arm/armv6/stpcpy.S
> create mode 100644 ports/sysdeps/arm/armv6/strcpy.S
OK.
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 4/5] arm: Implement armv6 optimized strrchr
2013-03-06 23:54 ` [PATCH 4/5] arm: Implement armv6 optimized strrchr Richard Henderson
@ 2013-03-07 1:16 ` Joseph S. Myers
0 siblings, 0 replies; 11+ messages in thread
From: Joseph S. Myers @ 2013-03-07 1:16 UTC (permalink / raw)
To: Richard Henderson; +Cc: libc-ports
On Wed, 6 Mar 2013, Richard Henderson wrote:
> ---
> ports/sysdeps/arm/armv6/strrchr.S | 129 ++++++++++++++++++++++++++++++++++++++
> 1 file changed, 129 insertions(+)
> create mode 100644 ports/sysdeps/arm/armv6/strrchr.S
OK.
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 5/5] arm: Implement armv6 optimized rawmemchr
2013-03-06 23:54 ` [PATCH 5/5] arm: Implement armv6 optimized rawmemchr Richard Henderson
@ 2013-03-07 1:19 ` Joseph S. Myers
0 siblings, 0 replies; 11+ messages in thread
From: Joseph S. Myers @ 2013-03-07 1:19 UTC (permalink / raw)
To: Richard Henderson; +Cc: libc-ports
On Wed, 6 Mar 2013, Richard Henderson wrote:
> ---
> ports/sysdeps/arm/armv6/rawmemchr.S | 105 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 105 insertions(+)
> create mode 100644 ports/sysdeps/arm/armv6/rawmemchr.S
OK.
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2013-03-07 1:19 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-03-06 23:54 [PATCH 0/5] ARM v6 string routines Richard Henderson
2013-03-06 23:54 ` [PATCH 4/5] arm: Implement armv6 optimized strrchr Richard Henderson
2013-03-07 1:16 ` Joseph S. Myers
2013-03-06 23:54 ` [PATCH 3/5] arm: Implement armv6 optimized strchr Richard Henderson
2013-03-07 1:07 ` Joseph S. Myers
2013-03-06 23:54 ` [PATCH 5/5] arm: Implement armv6 optimized rawmemchr Richard Henderson
2013-03-07 1:19 ` Joseph S. Myers
2013-03-06 23:54 ` [PATCH 2/5] arm: Implement armv6 optimized strcpy Richard Henderson
2013-03-07 1:12 ` Joseph S. Myers
2013-03-06 23:54 ` [PATCH 1/5] arm: Implement armv6 optimized strlen Richard Henderson
2013-03-07 1:02 ` Joseph S. Myers
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).