From 52e23fde584c161989ffaf8ea5b6b02edbe23592 Mon Sep 17 00:00:00 2001
From: Prakhar Bahuguna
Date: Thu, 26 Jan 2017 10:06:10 +0000
Subject: [PATCH] Optimise memchr for NEON-enabled processors

---
 newlib/libc/machine/arm/memchr-stub.c |   4 +-
 newlib/libc/machine/arm/memchr.S      | 183 +++++++++++++++++++++++++++++++++-
 2 files changed, 185 insertions(+), 2 deletions(-)

diff --git a/newlib/libc/machine/arm/memchr-stub.c b/newlib/libc/machine/arm/memchr-stub.c
index 21ffbbd96..5c7881b9c 100644
--- a/newlib/libc/machine/arm/memchr-stub.c
+++ b/newlib/libc/machine/arm/memchr-stub.c
@@ -29,7 +29,9 @@
 
 #include "acle-compat.h"
 
-#if __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
+#if defined (__ARM_NEON__) || defined (__ARM_NEON)
+/* Defined in memchr.S. */
+#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
 /* Defined in memchr.S. */
 #else
 # include "../../string/memchr.c"
diff --git a/newlib/libc/machine/arm/memchr.S b/newlib/libc/machine/arm/memchr.S
index ef2d6d08b..b5dcf83c0 100644
--- a/newlib/libc/machine/arm/memchr.S
+++ b/newlib/libc/machine/arm/memchr.S
@@ -78,7 +78,188 @@
 
 #include "acle-compat.h"
 
 @ NOTE: This ifdef MUST match the one in memchr-stub.c
-#if __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
+#if defined (__ARM_NEON__) || defined (__ARM_NEON)
+
+        .arch   armv7-a
+        .fpu    neon
+
+
+/* Arguments */
+#define srcin           r0
+#define chrin           r1
+#define cntin           r2
+
+/* Retval */
+#define result          r0      /* Live range does not overlap with srcin */
+
+/* Working registers */
+#define src             r1      /* Live range does not overlap with chrin */
+#define tmp             r3
+#define synd            r0      /* No overlap with srcin or result */
+#define soff            r12
+
+/* Working NEON registers */
+#define vrepchr         q0
+#define vdata0          q1
+#define vdata0_0        d2      /* Lower half of vdata0 */
+#define vdata0_1        d3      /* Upper half of vdata0 */
+#define vdata1          q2
+#define vdata1_0        d4      /* Lower half of vdata1 */
+#define vdata1_1        d5      /* Upper half of vdata1 */
+#define vrepmask        q3
+#define vrepmask0       d6
+#define vrepmask1       d7
+#define vend            q4
+#define vend0           d8
+#define vend1           d9
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
+ * byte. Each bit is set if the relevant byte matched the requested character
+ * and cleared otherwise. Since the bits in the syndrome reflect exactly the
+ * order in which things occur in the original string, counting trailing zeros
+ * allows us to identify exactly which byte has matched.
+ */
+
+        .text
+        .thumb_func
+        .align 4
+        .p2align 4,,15
+        .global memchr
+        .type memchr,%function
+
+memchr:
+        .cfi_sections .debug_frame
+        .cfi_startproc
+        /* Use a simple loop if there are less than 8 bytes to search. */
+        cmp     cntin, #7
+        bhi     .Llargestr
+        and     chrin, chrin, #0xff
+
+.Lsmallstr:
+        subs    cntin, cntin, #1
+        blt     .Lnotfound      /* Return not found if reached end. */
+        ldrb    tmp, [srcin], #1
+        cmp     tmp, chrin
+        bne     .Lsmallstr      /* Loop again if not found. */
+        /* Otherwise fixup address and return. */
+        sub     result, result, #1
+        bx      lr
+
+
+.Llargestr:
+        vdup.8  vrepchr, chrin  /* Duplicate char across all lanes. */
+        /*
+         * Magic constant 0x8040201008040201 allows us to identify which lane
+         * matches the requested byte.
+         */
+        movw    tmp, #0x0201
+        movt    tmp, #0x0804
+        lsl     soff, tmp, #4
+        vmov    vrepmask0, tmp, soff
+        vmov    vrepmask1, tmp, soff
+        /* Work with aligned 32-byte chunks */
+        bic     src, srcin, #31
+        ands    soff, srcin, #31
+        beq     .Lloopintro     /* Go straight to main loop if it's aligned. */
+
+        /*
+         * Input string is not 32-byte aligned. We calculate the syndrome
+         * value for the aligned 32-byte block containing the first bytes
+         * and mask the irrelevant part.
+         */
+        vld1.8          {vdata0, vdata1}, [src:256]!
+        sub             tmp, soff, #32
+        adds            cntin, cntin, tmp
+        vceq.i8         vdata0, vdata0, vrepchr
+        vceq.i8         vdata1, vdata1, vrepchr
+        vand            vdata0, vdata0, vrepmask
+        vand            vdata1, vdata1, vrepmask
+        vpadd.i8        vdata0_0, vdata0_0, vdata0_1
+        vpadd.i8        vdata1_0, vdata1_0, vdata1_1
+        vpadd.i8        vdata0_0, vdata0_0, vdata1_0
+        vpadd.i8        vdata0_0, vdata0_0, vdata0_0
+        vmov            synd, vdata0_0[0]
+
+        /* Clear the soff lower bits */
+        lsr             synd, synd, soff
+        lsl             synd, synd, soff
+        /* The first block can also be the last */
+        bls             .Lmasklast
+        /* Have we found something already? */
+        cbnz            synd, .Ltail
+
+
+.Lloopintro:
+        vpush           {vend}
+        /* 264/265 correspond to d8/d9 for q4 */
+        .cfi_adjust_cfa_offset  16
+        .cfi_rel_offset 264, 0
+        .cfi_rel_offset 265, 8
+        .p2align 3,,7
+.Lloop:
+        vld1.8          {vdata0, vdata1}, [src:256]!
+        subs            cntin, cntin, #32
+        vceq.i8         vdata0, vdata0, vrepchr
+        vceq.i8         vdata1, vdata1, vrepchr
+        /* If we're out of data we finish regardless of the result. */
+        bls             .Lend
+        /* Use a fast check for the termination condition. */
+        vorr            vend, vdata0, vdata1
+        vorr            vend0, vend0, vend1
+        vmov            synd, tmp, vend0
+        orrs            synd, synd, tmp
+        /* We're not out of data, loop if we haven't found the character. */
+        beq             .Lloop
+
+.Lend:
+        vpop            {vend}
+        .cfi_adjust_cfa_offset  -16
+        .cfi_restore    264
+        .cfi_restore    265
+
+        /* Termination condition found, let's calculate the syndrome value. */
+        vand            vdata0, vdata0, vrepmask
+        vand            vdata1, vdata1, vrepmask
+        vpadd.i8        vdata0_0, vdata0_0, vdata0_1
+        vpadd.i8        vdata1_0, vdata1_0, vdata1_1
+        vpadd.i8        vdata0_0, vdata0_0, vdata1_0
+        vpadd.i8        vdata0_0, vdata0_0, vdata0_0
+        vmov            synd, vdata0_0[0]
+        cbz             synd, .Lnotfound
+        bhi             .Ltail
+
+
+.Lmasklast:
+        /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
+        neg             cntin, cntin
+        lsl             synd, synd, cntin
+        lsrs            synd, synd, cntin
+        it              eq
+        moveq           src, #0 /* If no match, set src to 0 so the retval is 0. */
+
+
+.Ltail:
+        /* Count the trailing zeros using bit reversing */
+        rbit            synd, synd
+        /* Compensate the last post-increment */
+        sub             src, src, #32
+        /* Count the leading zeros */
+        clz             synd, synd
+        /* Compute the potential result and return */
+        add             result, src, synd
+        bx              lr
+
+
+.Lnotfound:
+        /* Set result to NULL if not found and return */
+        mov             result, #0
+        bx              lr
+
+        .cfi_endproc
+        .size   memchr, . - memchr
+
+#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)
 #if __ARM_ARCH_PROFILE == 'M'
         .arch armv7e-m
-- 
2.11.0
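
Note for reviewers: the syndrome scheme described in the "Core algorithm"
comment can be modelled in plain C, which may make the NEON sequence easier
to follow. The sketch below is only an illustration under assumed names
(memchr_syndrome_model, a 32-byte chunking loop, and GCC's __builtin_ctz for
trailing-zero counting); it is not part of the patch and does not reproduce
the vceq/vand/vpadd instruction sequence, only its effect.

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the 32-byte syndrome search (illustrative only).  */
static void *
memchr_syndrome_model (const void *s, int c, size_t n)
{
  const unsigned char *p = s;
  unsigned char ch = (unsigned char) c;

  while (n > 0)
    {
      size_t chunk = n < 32 ? n : 32;
      uint32_t synd = 0;

      /* Bit i of the syndrome is set iff byte i of the chunk matches.
         The NEON code obtains the same value by comparing 32 bytes at
         once, ANDing with the repeated 0x8040201008040201 lane mask and
         folding the lanes together with pairwise additions.  */
      for (size_t i = 0; i < chunk; i++)
        if (p[i] == ch)
          synd |= (uint32_t) 1 << i;

      /* Counting trailing zeros gives the index of the first match,
         because bit order follows byte order.  The assembly does this
         with rbit + clz, since ARM has no direct ctz instruction.  */
      if (synd != 0)
        return (void *) (p + __builtin_ctz (synd));

      p += chunk;
      n -= chunk;
    }
  return NULL;
}

Because the mask assigns each byte within a doubleword a distinct power of
two, every byte contributes a unique bit, so the vpadd folding steps cannot
carry into a neighbouring position and the packed 32-bit word is an exact
per-byte match map.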