* Re: [PATCH][AArch64] Add optimized memchr
@ 2015-12-15 16:41 Wilco Dijkstra
2016-04-15 12:42 ` Wilco Dijkstra
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Wilco Dijkstra @ 2015-12-15 16:41 UTC (permalink / raw)
To: 'GNU C Library'; +Cc: nd
[-- Attachment #1: Type: text/plain, Size: 516 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
[-- Attachment #2: 0001-Add-memchr.txt --]
[-- Type: text/plain, Size: 5008 bytes --]
---
sysdeps/aarch64/memchr.S | 157 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 157 insertions(+)
create mode 100644 sysdeps/aarch64/memchr.S
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
new file mode 100644
index 0000000..2f643dd
--- /dev/null
+++ b/sysdeps/aarch64/memchr.S
@@ -0,0 +1,157 @@
+/* memchr - find a character in a memory zone
+
+ Copyright (C) 2015 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32-bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows us to
+ * identify exactly which byte has matched.
+ */
+
+ENTRY (__memchr)
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, L(zero_length)
+ /*
+ * Magic constant 0x40100401 allows us to identify which lane matches
+ * the requested byte.
+ */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31
+ and cntrem, cntin, #31
+ b.eq L(loop)
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32-byte block containing the first bytes
+ * and mask the irrelevant part.
+ */
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Clear the soff*2 lower bits */
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ /* The first block can also be the last */
+ b.ls L(masklast)
+ /* Have we found something already? */
+ cbnz synd, L(tail)
+
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* If we're out of data we finish regardless of the result */
+ b.ls L(end)
+ /* Use a fast check for the termination condition */
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.2d[0]
+ /* We're not out of data, loop if we haven't found the character */
+ cbz synd, L(loop)
+
+L(end):
+ /* Termination condition found, let's calculate the syndrome value */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Only do the clear for the last possible block */
+ b.hi L(tail)
+
+L(masklast):
+ /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+L(tail):
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Check that we have found a character */
+ cmp synd, #0
+ /* And count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result */
+ add result, src, synd, lsr #1
+ /* Select result or NULL */
+ csel result, xzr, result, eq
+ ret
+
+L(zero_length):
+ mov result, #0
+ ret
+END (__memchr)
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
--
1.9.1
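As a reading aid for the syndrome trick in the patch above, here is a
minimal scalar C model. It is an illustration only, not the glibc
implementation; the helper names syndrome32, mask_low, mask_high and
tail are invented for this sketch.

#include <stdint.h>
#include <stddef.h>

/* Bit 2*i of the result is set when block[i] == c; bit 2*i+1 is left
   unused, matching the two-bits-per-byte layout that the and/addp
   sequence above packs into a single 64-bit register.  */
static uint64_t
syndrome32 (const unsigned char *block, unsigned char c)
{
  uint64_t synd = 0;
  for (size_t i = 0; i < 32; i++)
    if (block[i] == c)
      synd |= 1ULL << (2 * i);
  return synd;
}

/* Misaligned head: clear the soff*2 low bits so bytes before the real
   start cannot match (the lsl/lsr pair after the first load).  */
static uint64_t
mask_low (uint64_t synd, unsigned soff)
{
  return (synd >> (2 * soff)) << (2 * soff);
}

/* L(masklast): clear the bits for bytes past the end.  The "& 63"
   models AArch64 variable shifts, which use the amount mod 64, so a
   shift of 64 degenerates to 0 and keeps a fully valid last block.  */
static uint64_t
mask_high (uint64_t synd, unsigned cntrem, unsigned soff)
{
  unsigned keep = (cntrem + soff) & 31;      /* 0 means fully valid */
  unsigned shift = ((32 - keep) * 2) & 63;
  return (synd << shift) >> shift;
}

/* L(tail): count trailing zeros (rbit + clz above) and halve to get
   the byte index; a zero syndrome yields NULL, like the csel.  */
static const unsigned char *
tail (const unsigned char *block, uint64_t synd)
{
  return synd ? block + __builtin_ctzll (synd) / 2 : NULL;
}

The two-bits-per-byte layout exists because two addp steps can reduce
256 bits of compare results to one 64-bit register; halving the
trailing-zero count then recovers the byte index.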
* Re: [PATCH][AArch64] Add optimized memchr
2015-12-15 16:41 [PATCH][AArch64] Add optimized memchr Wilco Dijkstra
@ 2016-04-15 12:42 ` Wilco Dijkstra
2016-05-12 14:02 ` Fw: " Wilco Dijkstra
2016-10-19 14:58 ` Wilco Dijkstra
2 siblings, 0 replies; 12+ messages in thread
From: Wilco Dijkstra @ 2016-04-15 12:42 UTC (permalink / raw)
To: 'GNU C Library'; +Cc: nd, Richard Earnshaw, Marcus Shawcroft
[-- Attachment #1: Type: text/plain, Size: 707 bytes --]
ping
________________________________________
From: Wilco Dijkstra
Sent: 15 December 2015 16:41
To: 'GNU C Library'
Cc: nd
Subject: Re: [PATCH][AArch64] Add optimized memchr
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Fw: [PATCH][AArch64] Add optimized memchr
2015-12-15 16:41 [PATCH][AArch64] Add optimized memchr Wilco Dijkstra
2016-04-15 12:42 ` Wilco Dijkstra
@ 2016-05-12 14:02 ` Wilco Dijkstra
2016-06-03 12:41 ` Wilco Dijkstra
2016-10-19 14:58 ` Wilco Dijkstra
2 siblings, 1 reply; 12+ messages in thread
From: Wilco Dijkstra @ 2016-05-12 14:02 UTC (permalink / raw)
To: 'GNU C Library', Marcus Shawcroft; +Cc: nd
[-- Attachment #1: Type: text/plain, Size: 526 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Fw: [PATCH][AArch64] Add optimized memchr
2016-05-12 14:02 ` Fw: " Wilco Dijkstra
@ 2016-06-03 12:41 ` Wilco Dijkstra
2016-06-21 13:35 ` Wilco Dijkstra
0 siblings, 1 reply; 12+ messages in thread
From: Wilco Dijkstra @ 2016-06-03 12:41 UTC (permalink / raw)
To: Marcus Shawcroft; +Cc: nd, 'GNU C Library'
[-- Attachment #1: Type: text/plain, Size: 528 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Re: [PATCH][AArch64] Add optimized memchr
2016-06-03 12:41 ` Wilco Dijkstra
@ 2016-06-21 13:35 ` Wilco Dijkstra
0 siblings, 0 replies; 12+ messages in thread
From: Wilco Dijkstra @ 2016-06-21 13:35 UTC (permalink / raw)
To: Marcus Shawcroft; +Cc: nd, 'GNU C Library'
[-- Attachment #1: Type: text/plain, Size: 532 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Re: [PATCH][AArch64] Add optimized memchr
2015-12-15 16:41 [PATCH][AArch64] Add optimized memchr Wilco Dijkstra
2016-04-15 12:42 ` Wilco Dijkstra
2016-05-12 14:02 ` Fw: " Wilco Dijkstra
@ 2016-10-19 14:58 ` Wilco Dijkstra
2016-10-25 17:08 ` Adhemerval Zanella
2016-11-02 16:45 ` Wilco Dijkstra
2 siblings, 2 replies; 12+ messages in thread
From: Wilco Dijkstra @ 2016-10-19 14:58 UTC (permalink / raw)
To: libc-alpha, Richard Earnshaw, Szabolcs Nagy, Marcus Shawcroft; +Cc: nd
[-- Attachment #1: Type: text/plain, Size: 534 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Re: [PATCH][AArch64] Add optimized memchr
2016-10-19 14:58 ` Wilco Dijkstra
@ 2016-10-25 17:08 ` Adhemerval Zanella
2016-11-02 16:45 ` Wilco Dijkstra
1 sibling, 0 replies; 12+ messages in thread
From: Adhemerval Zanella @ 2016-10-25 17:08 UTC (permalink / raw)
To: libc-alpha
LGTM.
On 19/10/2016 12:58, Wilco Dijkstra wrote:
>
>
> ping
>
> -----Original Message-----
> From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> Sent: 25 September 2015 14:21
> To: 'GNU C Library'
> Subject: [PATCH][AArch64] Add optimized memchr
>
> An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
>
> OK for commit?
>
> ChangeLog:
> 2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
> 2015-09-25 Kevin Petit <kevin.petit@arm.com>
>
> * sysdeps/aarch64/memchr.S (__memchr): New file.
>
>
* Re: [PATCH][AArch64] Add optimized memchr
2016-10-19 14:58 ` Wilco Dijkstra
2016-10-25 17:08 ` Adhemerval Zanella
@ 2016-11-02 16:45 ` Wilco Dijkstra
2016-11-04 9:01 ` Marcus Shawcroft
1 sibling, 1 reply; 12+ messages in thread
From: Wilco Dijkstra @ 2016-11-02 16:45 UTC (permalink / raw)
To: libc-alpha, Marcus Shawcroft; +Cc: nd, Szabolcs Nagy, Richard Earnshaw
[-- Attachment #1: Type: text/plain, Size: 532 bytes --]
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Re: [PATCH][AArch64] Add optimized memchr
2016-11-02 16:45 ` Wilco Dijkstra
@ 2016-11-04 9:01 ` Marcus Shawcroft
0 siblings, 0 replies; 12+ messages in thread
From: Marcus Shawcroft @ 2016-11-04 9:01 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: libc-alpha
On 2 November 2016 at 16:45, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> ping
>
> -----Original Message-----
> From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> Sent: 25 September 2015 14:21
> To: 'GNU C Library'
> Subject: [PATCH][AArch64] Add optimized memchr
>
> An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests.
>
> OK for commit?
>
> ChangeLog:
> 2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
> 2015-09-25 Kevin Petit <kevin.petit@arm.com>
>
> * sysdeps/aarch64/memchr.S (__memchr): New file.
>
OK /Marcus
* [PATCH][AArch64] Add optimized memchr
@ 2015-09-25 13:21 Wilco Dijkstra
2015-09-26 8:46 ` Ondřej Bílka
0 siblings, 1 reply; 12+ messages in thread
From: Wilco Dijkstra @ 2015-09-25 13:21 UTC (permalink / raw)
To: 'GNU C Library'
[-- Attachment #1: Type: text/plain, Size: 330 bytes --]
An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly
faster than the C version. Passes GLIBC tests.
OK for commit?
ChangeLog:
2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
2015-09-25 Kevin Petit <kevin.petit@arm.com>
* sysdeps/aarch64/memchr.S (__memchr): New file.
* Re: [PATCH][AArch64] Add optimized memchr
2015-09-25 13:21 Wilco Dijkstra
@ 2015-09-26 8:46 ` Ondřej Bílka
2015-09-28 9:23 ` Wilco Dijkstra
0 siblings, 1 reply; 12+ messages in thread
From: Ondřej Bílka @ 2015-09-26 8:46 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: 'GNU C Library'
On Fri, Sep 25, 2015 at 02:21:13PM +0100, Wilco Dijkstra wrote:
> An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly
> faster than the C version. Passes GLIBC tests.
>
> OK for commit?
>
> ChangeLog:
> 2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
> 2015-09-25 Kevin Petit <kevin.petit@arm.com>
>
> * sysdeps/aarch64/memchr.S (__memchr): New file.
How did you test performance? I think that here, too, loading the first 32
bytes unaligned should be better. Could you use dryrun to verify?
The same optimization could also be used for memrchr.
* RE: [PATCH][AArch64] Add optimized memchr
2015-09-26 8:46 ` Ondřej Bílka
@ 2015-09-28 9:23 ` Wilco Dijkstra
0 siblings, 0 replies; 12+ messages in thread
From: Wilco Dijkstra @ 2015-09-28 9:23 UTC (permalink / raw)
To: 'Ondřej Bílka'; +Cc: 'GNU C Library'
> Ondřej Bílka wrote:
> On Fri, Sep 25, 2015 at 02:21:13PM +0100, Wilco Dijkstra wrote:
> > An optimized memchr was missing for AArch64. This version is similar to strchr and is
> significantly
> > faster than the C version. Passes GLIBC tests.
> >
> > OK for commit?
> >
> > ChangeLog:
> > 2015-09-25 Wilco Dijkstra <wdijkstr@arm.com>
> > 2015-09-25 Kevin Petit <kevin.petit@arm.com>
> >
> > * sysdeps/aarch64/memchr.S (__memchr): New file.
>
> How you tested performance. I think that also here loading first 32
> bytes unaligned should be better. Could you use dryrun to verify?
>
> Also same optimization could be used for memrchr.
I haven't tuned this at all; it is an existing implementation that
was added to Newlib last year but had not yet been ported to GLIBC.
For maximum performance, doing the first 16/32 bytes unaligned will likely
be fastest, just as it was for strlen.
Wilco
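A rough scalar sketch of that suggestion, reusing the model helpers
from earlier in the thread (untested and hypothetical; the function
name is invented, and a real NEON version would replace the byte loops
with one unaligned 32-byte ld1 followed by the patch's aligned loop):

#include <stdint.h>
#include <stddef.h>

/* Check the first up-to-32 bytes directly from the possibly
   unaligned start, instead of rounding down and masking the low
   syndrome bits, then continue from the next 32-byte boundary.  */
static const void *
memchr_unaligned_first (const void *s, int c_in, size_t n)
{
  const unsigned char *p = s;
  unsigned char c = (unsigned char) c_in;
  size_t head = n < 32 ? n : 32;

  /* Head: a single unaligned load in the vector version.  */
  for (size_t i = 0; i < head; i++)
    if (p[i] == c)
      return p + i;
  if (n <= 32)
    return NULL;

  /* Everything below the next 32-byte boundary is already checked;
     this loop stands in for the aligned L(loop) of the patch.  */
  const unsigned char *q =
    (const unsigned char *) (((uintptr_t) p | 31) + 1);
  const unsigned char *end = p + n;
  for (; q < end; q++)
    if (*q == c)
      return q;
  return NULL;
}

The trade-off is a possibly cache-line-crossing first load against the
bic/ands plus lsr/lsl masking of the patch's head handling.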