* Re: [PATCH][AArch64] Add optimized memchr
@ 2015-12-15 16:41 Wilco Dijkstra
2016-04-15 12:42 ` Wilco Dijkstra
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Wilco Dijkstra @ 2015-12-15 16:41 UTC (permalink / raw)
To: 'GNU C Library'; +Cc: nd
[-- Attachment #1: Type: text/plain, Size: 516 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
[-- Attachment #2: 0001-Add-memchr.txt --]
[-- Type: text/plain, Size: 5008 bytes --]
---
sysdeps/aarch64/memchr.S | 157 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 157 insertions(+)
create mode 100644 sysdeps/aarch64/memchr.S
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
new file mode 100644
index 0000000..2f643dd
--- /dev/null
+++ b/sysdeps/aarch64/memchr.S
@@ -0,0 +1,157 @@
+/* memchr - find a character in a memory zone
+
+ Copyright (C) 2015 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32-bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows us to
+ * identify exactly which byte has matched.
+ */
+
+ENTRY (__memchr)
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, L(zero_length)
+ /*
+ * Magic constant 0x40100401 allows us to identify which lane matches
+ * the requested byte.
+ */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31
+ and cntrem, cntin, #31
+ b.eq L(loop)
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32-byte block containing the first bytes
+ * and mask the irrelevant part.
+ */
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Clear the soff*2 lower bits */
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ /* The first block can also be the last */
+ b.ls L(masklast)
+ /* Have we found something already? */
+ cbnz synd, L(tail)
+
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* If we're out of data we finish regardless of the result */
+ b.ls L(end)
+ /* Use a fast check for the termination condition */
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.2d[0]
+ /* We're not out of data, loop if we haven't found the character */
+ cbz synd, L(loop)
+
+L(end):
+ /* Termination condition found, let's calculate the syndrome value */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Only do the clear for the last possible block */
+ b.hi L(tail)
+
+L(masklast):
+ /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+L(tail):
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Check that we have found a character */
+ cmp synd, #0
+ /* And count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result */
+ add result, src, synd, lsr #1
+ /* Select result or NULL */
+ csel result, xzr, result, eq
+ ret
+
+L(zero_length):
+ mov result, #0
+ ret
+END (__memchr)
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
--
1.9.1
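As a reading aid for the syndrome trick in the patch above, here is a
minimal scalar C model. It is an illustration only, not the glibc
implementation; the helper names syndrome32, mask_low, mask_high and
tail are invented for this sketch.

#include <stdint.h>
#include <stddef.h>

/* Bit 2*i of the result is set when block[i] == c; bit 2*i+1 is left
   unused, matching the two-bits-per-byte layout that the and/addp
   sequence above packs into a single 64-bit register.  */
static uint64_t
syndrome32 (const unsigned char *block, unsigned char c)
{
  uint64_t synd = 0;
  for (size_t i = 0; i < 32; i++)
    if (block[i] == c)
      synd |= 1ULL << (2 * i);
  return synd;
}

/* Misaligned head: clear the soff*2 low bits so bytes before the real
   start cannot match (the lsl/lsr pair after the first load).  */
static uint64_t
mask_low (uint64_t synd, unsigned soff)
{
  return (synd >> (2 * soff)) << (2 * soff);
}

/* L(masklast): clear the bits for bytes past the end.  The "& 63"
   models AArch64 variable shifts, which use the amount mod 64, so a
   shift of 64 degenerates to 0 and keeps a fully valid last block.  */
static uint64_t
mask_high (uint64_t synd, unsigned cntrem, unsigned soff)
{
  unsigned keep = (cntrem + soff) & 31;      /* 0 means fully valid */
  unsigned shift = ((32 - keep) * 2) & 63;
  return (synd << shift) >> shift;
}

/* L(tail): count trailing zeros (rbit + clz above) and halve to get
   the byte index; a zero syndrome yields NULL, like the csel.  */
static const unsigned char *
tail (const unsigned char *block, uint64_t synd)
{
  return synd ? block + __builtin_ctzll (synd) / 2 : NULL;
}

The two-bits-per-byte layout exists because two addp steps can reduce
256 bits of compare results to one 64-bit register; halving the
trailing-zero count then recovers the byte index.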
* Re: [PATCH][AArch64] Add optimized memchr
2015-12-15 16:41 [PATCH][AArch64] Add optimized memchr Wilco Dijkstra
@ 2016-04-15 12:42 ` Wilco Dijkstra
2016-05-12 14:02 ` Fw: " Wilco Dijkstra
2016-10-19 14:58 ` Wilco Dijkstra
2 siblings, 0 replies; 12+ messages in thread
From: Wilco Dijkstra @ 2016-04-15 12:42 UTC (permalink / raw)
To: 'GNU C Library'; +Cc: nd, Richard Earnshaw, Marcus Shawcroft
[-- Attachment #1: Type: text/plain, Size: 707 bytes --]
ping
________________________________________
From: Wilco Dijkstra
Sent: 15 December 2015 16:41
To: 'GNU C Library'
Cc: nd
Subject: Re: [PATCH][AArch64] Add optimized memchr
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Fw: [PATCH][AArch64] Add optimized memchr
2015-12-15 16:41 [PATCH][AArch64] Add optimized memchr Wilco Dijkstra
2016-04-15 12:42 ` Wilco Dijkstra
@ 2016-05-12 14:02 ` Wilco Dijkstra
2016-06-03 12:41 ` Wilco Dijkstra
2016-10-19 14:58 ` Wilco Dijkstra
2 siblings, 1 reply; 12+ messages in thread
From: Wilco Dijkstra @ 2016-05-12 14:02 UTC (permalink / raw)
To: 'GNU C Library', Marcus Shawcroft; +Cc: nd
[-- Attachment #1: Type: text/plain, Size: 526 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Fw: [PATCH][AArch64] Add optimized memchr
2016-05-12 14:02 ` Fw: " Wilco Dijkstra
@ 2016-06-03 12:41 ` Wilco Dijkstra
2016-06-21 13:35 ` Wilco Dijkstra
0 siblings, 1 reply; 12+ messages in thread
From: Wilco Dijkstra @ 2016-06-03 12:41 UTC (permalink / raw)
To: Marcus Shawcroft; +Cc: nd, 'GNU C Library'
[-- Attachment #1: Type: text/plain, Size: 528 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Re: [PATCH][AArch64] Add optimized memchr
2016-06-03 12:41 ` Wilco Dijkstra
@ 2016-06-21 13:35 ` Wilco Dijkstra
0 siblings, 0 replies; 12+ messages in thread
From: Wilco Dijkstra @ 2016-06-21 13:35 UTC (permalink / raw)
To: Marcus Shawcroft; +Cc: nd, 'GNU C Library'
[-- Attachment #1: Type: text/plain, Size: 532 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Re: [PATCH][AArch64] Add optimized memchr
2015-12-15 16:41 [PATCH][AArch64] Add optimized memchr Wilco Dijkstra
2016-04-15 12:42 ` Wilco Dijkstra
2016-05-12 14:02 ` Fw: " Wilco Dijkstra
@ 2016-10-19 14:58 ` Wilco Dijkstra
2016-10-25 17:08 ` Adhemerval Zanella
2016-11-02 16:45 ` Wilco Dijkstra
2 siblings, 2 replies; 12+ messages in thread
From: Wilco Dijkstra @ 2016-10-19 14:58 UTC (permalink / raw)
To: libc-alpha, Richard Earnshaw, Szabolcs Nagy, Marcus Shawcroft; +Cc: nd
[-- Attachment #1: Type: text/plain, Size: 534 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Re: [PATCH][AArch64] Add optimized memchr
2016-10-19 14:58 ` Wilco Dijkstra
@ 2016-10-25 17:08 ` Adhemerval Zanella
2016-11-02 16:45 ` Wilco Dijkstra
1 sibling, 0 replies; 12+ messages in thread
From: Adhemerval Zanella @ 2016-10-25 17:08 UTC (permalink / raw)
To: libc-alpha
LGTM.
On 19/10/2016 12:58, Wilco Dijkstra wrote:
>
>
> ping
>
> -----Original Message-----
> From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> Sent: 25 September 2015 14:21
> To: 'GNU C Library'
> Subject: [PATCH][AArch64] Add optimized memchr
>
> An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
>
> OK for commit?
>
> ChangeLog:
> 2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
> 2015-09-25 Kevin Petit <kevin.petit@arm.com>
>
> * sysdeps/aarch64/memchr.S (__memchr): New file.
>
>
* Re: [PATCH][AArch64] Add optimized memchr
2016-10-19 14:58 ` Wilco Dijkstra
2016-10-25 17:08 ` Adhemerval Zanella
@ 2016-11-02 16:45 ` Wilco Dijkstra
2016-11-04 9:01 ` Marcus Shawcroft
1 sibling, 1 reply; 12+ messages in thread
From: Wilco Dijkstra @ 2016-11-02 16:45 UTC (permalink / raw)
To: libc-alpha, Marcus Shawcroft; +Cc: nd, Szabolcs Nagy, Richard Earnshaw
[-- Attachment #1: Type: text/plain, Size: 532 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Re: [PATCH][AArch64] Add optimized memchr
2016-11-02 16:45 ` Wilco Dijkstra
@ 2016-11-04 9:01 ` Marcus Shawcroft
0 siblings, 0 replies; 12+ messages in thread
From: Marcus Shawcroft @ 2016-11-04 9:01 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: libc-alpha
On 2 November 2016 at 16:45, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> ping
>
> -----Original Message-----
> From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> Sent: 25 September 2015 14:21
> To: 'GNU C Library'
> Subject: [PATCH][AArch64] Add optimized memchr
>
> An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
>
> OK for commit?
>
> ChangeLog:
> 2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
> 2015-09-25 Kevin Petit <kevin.petit@arm.com>
>
> * sysdeps/aarch64/memchr.S (__memchr): New file.
>
OK /Marcus
* [PATCH][AArch64] Add optimized memchr
@ 2015-09-25 13:21 Wilco Dijkstra
2015-09-26 8:46 ` Ondřej Bílka
0 siblings, 1 reply; 12+ messages in thread
From: Wilco Dijkstra @ 2015-09-25 13:21 UTC (permalink / raw)
To: 'GNU C Library'
[-- Attachment #1: Type: text/plain, Size: 330 bytes --]
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly
faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Re: [PATCH][AArch64] Add optimized memchr
2015-09-25 13:21 Wilco Dijkstra
@ 2015-09-26 8:46 ` Ondřej Bílka
2015-09-28 9:23 ` Wilco Dijkstra
0 siblings, 1 reply; 12+ messages in thread
From: Ondřej Bílka @ 2015-09-26 8:46 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: 'GNU C Library'
On Fri, Sep 25, 2015 at 02:21:13PM +0100, Wilco Dijkstra wrote:
> An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly
> faster than the C version. Passes GLIBC tests.
>
> OK for commit?
>
> ChangeLog:
> 2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
> 2015-09-25 Kevin Petit <kevin.petit@arm.com>
>
> * sysdeps/aarch64/memchr.S (__memchr): New file.
How did you test performance? I think that here, too, loading the first 32
bytes unaligned should be better. Could you use dryrun to verify?
The same optimization could also be used for memrchr.
* RE: [PATCH][AArch64] Add optimized memchr
2015-09-26 8:46 ` Ondřej Bílka
@ 2015-09-28 9:23 ` Wilco Dijkstra
0 siblings, 0 replies; 12+ messages in thread
From: Wilco Dijkstra @ 2015-09-28 9:23 UTC (permalink / raw)
To: 'Ondřej Bílka'; +Cc: 'GNU C Library'
> Ondřej Bílka wrote:
> On Fri, Sep 25, 2015 at 02:21:13PM +0100, Wilco Dijkstra wrote:
> > An optimized memchr was missing for AArch64. This version is similar to strchr and is
> significantly
> > faster than the C version. Passes GLIBC tests.
> >
> > OK for commit?
> >
> > ChangeLog:
> > 2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
> > 2015-09-25 Kevin Petit <kevin.petit@arm.com>
> >
> > * sysdeps/aarch64/memchr.S (__memchr): New file.
>
> How you tested performance. I think that also here loading first 32
> bytes unaligned should be better. Could you use dryrun to verify?
>
> Also same optimization could be used for memrchr.
I haven't tuned this at all; it is an existing implementation that
was added to Newlib last year but had not yet been ported to GLIBC.
For maximum performance, doing the first 16/32 bytes unaligned will likely
be fastest, just as it was for strlen.
Wilco
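A rough scalar sketch of that suggestion, reusing the model helpers
from earlier in the thread (untested and hypothetical; the function
name is invented, and a real NEON version would replace the byte loops
with one unaligned 32-byte ld1 followed by the patch's aligned loop):

#include <stdint.h>
#include <stddef.h>

/* Check the first up-to-32 bytes directly from the possibly
   unaligned start, instead of rounding down and masking the low
   syndrome bits, then continue from the next 32-byte boundary.  */
static const void *
memchr_unaligned_first (const void *s, int c_in, size_t n)
{
  const unsigned char *p = s;
  unsigned char c = (unsigned char) c_in;
  size_t head = n < 32 ? n : 32;

  /* Head: a single unaligned load in the vector version.  */
  for (size_t i = 0; i < head; i++)
    if (p[i] == c)
      return p + i;
  if (n <= 32)
    return NULL;

  /* Everything below the next 32-byte boundary is already checked;
     this loop stands in for the aligned L(loop) of the patch.  */
  const unsigned char *q =
    (const unsigned char *) (((uintptr_t) p | 31) + 1);
  const unsigned char *end = p + n;
  for (; q < end; q++)
    if (*q == c)
      return q;
  return NULL;
}

The trade-off is a possibly cache-line-crossing first load against the
bic/ands plus lsr/lsl masking of the patch's head handling.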