public inbox for newlib@sourceware.org
* [PATCH, NEWLIB/ARM] Optimise memchr for NEON-enabled processors
@ 2017-04-05 12:39 Prakhar Bahuguna
  2017-04-06 16:20 ` Corinna Vinschen
  0 siblings, 1 reply; 2+ messages in thread
From: Prakhar Bahuguna @ 2017-04-05 12:39 UTC (permalink / raw)
  To: newlib; +Cc: nd

[-- Attachment #1: Type: text/plain, Size: 908 bytes --]

This patch provides an optimised implementation of memchr using NEON
instructions to improve its performance, especially for longer search regions.
In benchmarks it gave at least twice the performance of the Thumb2+DSP
optimised code, and as much as four times the performance for larger inputs.
The NEON code also wins when the input is small (fewer than 8 bytes) by
falling back to a simple byte-by-byte search, which avoids the overhead of
filling two quadword registers from memory.
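
For illustration, here is a rough C sketch of the core idea using NEON
intrinsics. This is a model for exposition only, not the code in this patch:
memchr_sketch is a hypothetical name, it works on 16-byte rather than 32-byte
chunks, omits the alignment handling, and assumes a little-endian target built
with NEON enabled (e.g. -mfpu=neon) and GCC's __builtin_ctz.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

void *memchr_sketch (const void *s, int c, size_t n)
{
  const unsigned char *p = s;

  /* Small inputs: a plain byte loop avoids the cost of filling
     quadword registers from memory.  */
  if (n < 8)
    {
      for (; n > 0; n--, p++)
	if (*p == (unsigned char) c)
	  return (void *) p;
      return NULL;
    }

  uint8x16_t repchr = vdupq_n_u8 ((unsigned char) c);
  /* Give every byte lane a unique bit, as the magic constant
     0x8040201008040201 does in the assembly.  */
  static const uint8_t bits[16] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
  uint8x16_t repmask = vld1q_u8 (bits);

  for (; n >= 16; n -= 16, p += 16)
    {
      uint8x16_t eq = vandq_u8 (vceqq_u8 (vld1q_u8 (p), repchr), repmask);
      /* Pairwise adds fold the 16 match bytes into a 16-bit syndrome
	 with one bit per byte, in string order.  The summed bits are
	 disjoint, so no carries can occur.  */
      uint8x8_t f = vpadd_u8 (vget_low_u8 (eq), vget_high_u8 (eq));
      f = vpadd_u8 (f, f);
      f = vpadd_u8 (f, f);
      uint16_t synd = vget_lane_u16 (vreinterpret_u16_u8 (f), 0);
      if (synd)	/* Trailing zeros give the index of the first match.  */
	return (void *) (p + __builtin_ctz (synd));
    }

  for (; n > 0; n--, p++)
    if (*p == (unsigned char) c)
      return (void *) p;
  return NULL;
}

The assembly in the attached patch applies the same fold to two quadword
registers (32 bytes) per iteration and derives the index with rbit/clz
instead of __builtin_ctz.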

newlib/ChangeLog:

2017-01-26  Prakhar Bahuguna  <prakhar.bahuguna@arm.com>

	* newlib/libc/machine/arm/memchr-stub.c: Add __ARM_NEON__ ifdef.
	Change if to elif.
	* newlib/libc/machine/arm/memchr.S: Add __ARM_NEON__ ifdef.
	Add NEON-optimised memchr implementation.
	Change if to elif.

Testing done: Ran regression tests for arm-none-eabi, and benchmark tests on
hardware.

Okay for master?

--

Prakhar Bahuguna

[-- Attachment #2: 0001-Optimise-memchr-for-NEON-enabled-processors.patch --]
[-- Type: text/plain, Size: 6204 bytes --]

From 52e23fde584c161989ffaf8ea5b6b02edbe23592 Mon Sep 17 00:00:00 2001
From: Prakhar Bahuguna <prakhar.bahuguna@arm.com>
Date: Thu, 26 Jan 2017 10:06:10 +0000
Subject: [PATCH] Optimise memchr for NEON-enabled processors

---
 newlib/libc/machine/arm/memchr-stub.c |   4 +-
 newlib/libc/machine/arm/memchr.S      | 183 +++++++++++++++++++++++++++++++++-
 2 files changed, 185 insertions(+), 2 deletions(-)

diff --git a/newlib/libc/machine/arm/memchr-stub.c b/newlib/libc/machine/arm/memchr-stub.c
index 21ffbbd96..5c7881b9c 100644
--- a/newlib/libc/machine/arm/memchr-stub.c
+++ b/newlib/libc/machine/arm/memchr-stub.c
@@ -29,7 +29,9 @@
 
 #include "acle-compat.h"
 
-#if __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
+#if defined (__ARM_NEON__) || defined (__ARM_NEON)
+/* Defined in memchr.S.  */
+#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
 /* Defined in memchr.S.  */
 #else
 # include "../../string/memchr.c"
diff --git a/newlib/libc/machine/arm/memchr.S b/newlib/libc/machine/arm/memchr.S
index ef2d6d08b..b5dcf83c0 100644
--- a/newlib/libc/machine/arm/memchr.S
+++ b/newlib/libc/machine/arm/memchr.S
@@ -78,7 +78,188 @@
 #include "acle-compat.h"
 
 @ NOTE: This ifdef MUST match the one in memchr-stub.c
-#if __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
+#if defined (__ARM_NEON__) || defined (__ARM_NEON)
+	.arch	armv7-a
+	.fpu	neon
+
+
+/* Arguments */
+#define srcin		r0
+#define chrin		r1
+#define cntin		r2
+
+/* Retval */
+#define result		r0	/* Live range does not overlap with srcin */
+
+/* Working registers */
+#define src		r1	/* Live range does not overlap with chrin */
+#define tmp		r3
+#define synd		r0	/* No overlap with srcin or result */
+#define soff		r12
+
+/* Working NEON registers */
+#define vrepchr		q0
+#define vdata0		q1
+#define vdata0_0	d2	/* Lower half of vdata0 */
+#define vdata0_1	d3	/* Upper half of vdata0 */
+#define vdata1		q2
+#define vdata1_0	d4	/* Lower half of vdata1 */
+#define vdata1_1	d5	/* Upper half of vdata1 */
+#define vrepmask	q3
+#define vrepmask0	d6
+#define vrepmask1	d7
+#define vend		q4
+#define vend0		d8
+#define vend1		d9
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
+ * byte. Each bit is set if the relevant byte matched the requested character
+ * and cleared otherwise. Since the bits in the syndrome reflect exactly the
+ * order in which bytes occur in the original string, counting trailing zeros
+ * allows us to identify exactly which byte has matched.
+ */
+
+	.text
+	.thumb_func
+	.align 4
+	.p2align 4,,15
+	.global memchr
+	.type memchr,%function
+
+memchr:
+	.cfi_sections .debug_frame
+	.cfi_startproc
+	/* Use a simple loop if there are fewer than 8 bytes to search.  */
+	cmp	cntin, #7
+	bhi	.Llargestr
+
+.Lsmallstr:
+	subs	cntin, cntin, #1
+	blt	.Lnotfound	/* Return not found if we have reached the end.  */
+	ldrb	tmp, [srcin], #1
+	cmp	tmp, chrin
+	bne	.Lsmallstr	/* Loop again if not found.  */
+	/* Otherwise fix up the address and return.  */
+	sub	result, result, #1
+	bx	lr
+
+
+.Llargestr:
+	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes. */
+	/*
+	 * Magic constant 0x8040201008040201 gives each byte lane a unique set
+	 * bit, so the match masks can be summed into one 32-bit syndrome.
+	 */
+	movw	tmp, #0x0201
+	movt	tmp, #0x0804
+	lsl	soff, tmp, #4
+	vmov	vrepmask0, tmp, soff
+	vmov	vrepmask1, tmp, soff
+	/* Work with aligned 32-byte chunks */
+	bic	src, srcin, #31
+	ands	soff, srcin, #31
+	beq	.Lloopintro	/* Go straight to main loop if it's aligned. */
+
+	/*
+	 * Input string is not 32-byte aligned. We calculate the syndrome
+	 * value for the aligned 32-byte block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+	vld1.8		{vdata0, vdata1}, [src:256]!
+	sub		tmp, soff, #32
+	adds		cntin, cntin, tmp
+	vceq.i8		vdata0, vdata0, vrepchr
+	vceq.i8		vdata1, vdata1, vrepchr
+	vand		vdata0, vdata0, vrepmask
+	vand		vdata1, vdata1, vrepmask
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
+	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
+	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
+	vmov		synd, vdata0_0[0]
+
+	/* Clear the soff low bits, which precede the start of the string.  */
+	lsr		synd, synd, soff
+	lsl		synd, synd, soff
+	/* The first block can also be the last */
+	bls		.Lmasklast
+	/* Have we found something already? */
+	cbnz		synd, .Ltail
+
+
+.Lloopintro:
+	vpush	{vend}
+	/* DWARF register numbers 264/265 correspond to d8/d9 (q4).  */
+	.cfi_adjust_cfa_offset	16
+	.cfi_rel_offset	264, 0
+	.cfi_rel_offset	265, 8
+	.p2align 3,,7
+.Lloop:
+	vld1.8		{vdata0, vdata1}, [src:256]!
+	subs		cntin, cntin, #32
+	vceq.i8		vdata0, vdata0, vrepchr
+	vceq.i8		vdata1, vdata1, vrepchr
+	/* If we're out of data we finish regardless of the result. */
+	bls		.Lend
+	/* Use a fast check for the termination condition. */
+	vorr		vend, vdata0, vdata1
+	vorr		vend0, vend0, vend1
+	vmov		synd, tmp, vend0
+	orrs		synd, synd, tmp
+	/* We're not out of data, loop if we haven't found the character. */
+	beq		.Lloop
+
+.Lend:
+	vpop		{vend}
+	.cfi_adjust_cfa_offset	-16
+	.cfi_restore	264
+	.cfi_restore	265
+
+	/* Termination condition found, let's calculate the syndrome value. */
+	vand		vdata0, vdata0, vrepmask
+	vand		vdata1, vdata1, vrepmask
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
+	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
+	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
+	vmov		synd, vdata0_0[0]
+	cbz		synd, .Lnotfound
+	bhi		.Ltail
+
+
+.Lmasklast:
+	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
+	neg	cntin, cntin
+	lsl	synd, synd, cntin
+	lsrs	synd, synd, cntin
+	it	eq
+	moveq	src, #0	/* If no match, set src to 0 so .Ltail returns NULL.  */
+
+
+.Ltail:
+	/* Count the trailing zeros using bit reversal.  */
+	rbit	synd, synd
+	/* Compensate for the final post-increment of src.  */
+	sub	src, src, #32
+	/* Leading zeros now equal the trailing zeros of the old synd.  */
+	clz	synd, synd
+	/* Compute the potential result and return.  */
+	add	result, src, synd
+	bx	lr
+
+
+.Lnotfound:
+	/* Set result to NULL if not found and return */
+	mov	result, #0
+	bx	lr
+
+	.cfi_endproc
+	.size	memchr, . - memchr
+
+#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
 
 #if __ARM_ARCH_PROFILE == 'M'
        .arch armv7e-m
-- 
2.11.0
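
For concreteness, the .Ltail sequence above recovers the match address with
arithmetic like the following (hypothetical values: the chunk loaded from
address A matched in byte 5, so synd has only bit 5 set and src has been
post-incremented to A + 32):

	rbit	synd, synd		@ bit 5 moves to bit 31-5 = 26
	sub	src, src, #32		@ back to A, the start of the chunk
	clz	synd, synd		@ 31-26 = 5, trailing zeros of old synd
	add	result, src, synd	@ A + 5, the address of the match

The same arithmetic yields NULL in the .Lmasklast case: synd is 0 and src is
forced to 0, so rbit leaves 0, clz returns 32, and (0 - 32) + 32 = 0.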



* Re: [PATCH, NEWLIB/ARM] Optimise memchr for NEON-enabled processors
  2017-04-05 12:39 [PATCH, NEWLIB/ARM] Optimise memchr for NEON-enabled processors Prakhar Bahuguna
@ 2017-04-06 16:20 ` Corinna Vinschen
  0 siblings, 0 replies; 2+ messages in thread
From: Corinna Vinschen @ 2017-04-06 16:20 UTC (permalink / raw)
  To: newlib

[-- Attachment #1: Type: text/plain, Size: 1081 bytes --]

On Apr  5 13:39, Prakhar Bahuguna wrote:
> This patch provides an optimised implementation of memchr using NEON
> instructions to improve its performance, especially for longer search regions.
> In benchmarks it gave at least twice the performance of the Thumb2+DSP
> optimised code, and as much as four times the performance for larger inputs.
> The NEON code also wins when the input is small (fewer than 8 bytes) by
> falling back to a simple byte-by-byte search, which avoids the overhead of
> filling two quadword registers from memory.
> 
> newlib/ChangeLog:
> 
> 2017-01-26  Prakhar Bahuguna  <prakhar.bahuguna@arm.com>
> 
> 	* newlib/libc/machine/arm/memchr-stub.c: Add __ARM_NEON__ ifdef.
> 	Change if to elif.
> 	* newlib/libc/machine/arm/memchr.S: Add __ARM_NEON__ ifdef.
> 	Add NEON-optimised memchr implementation.
> 	Change if to elif.
> 
> Testing done: Ran regression tests for arm-none-eabi, and benchmark tests on
> hardware.
> 
> Okay for master?

Applied.


Thanks,
Corinna

-- 
Corinna Vinschen
Cygwin Maintainer
Red Hat

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]


