From: Richard Henderson
To: libc-ports@sourceware.org
Cc: Joseph Myers
Subject: [PATCH 23/26] arm: Rewrite armv6t2 memchr with uqadd8
Date: Wed, 27 Feb 2013 03:17:00 -0000
Message-Id: <1361934986-17018-24-git-send-email-rth@twiddle.net>
In-Reply-To: <1361934986-17018-1-git-send-email-rth@twiddle.net>
References: <1361934986-17018-1-git-send-email-rth@twiddle.net>

Not recently speed tested vs the Linaro version, but having a common
set of algorithms for all the *chr routines is surely worth something
for maintenance.  I do recall it was once a few percent faster on A8.
---
	* sysdeps/arm/armv6t2/memchr.S: Rewrite.
---
 ports/sysdeps/arm/armv6t2/memchr.S | 216 +++++++++++++++++--------------------
 1 file changed, 101 insertions(+), 115 deletions(-)
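As an aside for reviewers, here is a rough C model of the uqadd8 trick
the new code is built around.  It is an illustration only: the helper
names are invented, and unlike the assembly it handles one 4-byte word
per call rather than two.

#include <stdint.h>
#include <stdio.h>

/* Per-byte unsigned saturating add: a C model of the ARM uqadd8
   instruction used by the new code.  */
static uint32_t
uqadd8_model (uint32_t a, uint32_t b)
{
  uint32_t r = 0;
  for (int i = 0; i < 32; i += 8)
    {
      uint32_t s = ((a >> i) & 0xff) + ((b >> i) & 0xff);
      if (s > 0xff)
	s = 0xff;		/* Saturate instead of wrapping.  */
      r |= s << i;
    }
  return r;
}

/* Found bits for one word: xoring with the replicated char C turns a
   matching byte into 0, which saturates to 0xfe; every other byte
   becomes 0xff.  Thus the lsb of each byte is 0 exactly for a match.  */
static uint32_t
found_bits (uint32_t w, uint8_t c)
{
  return uqadd8_model (w ^ (c * 0x01010101u), 0xfefefefeu);
}

int
main (void)
{
  /* The 0x22 byte saturates to 0xfe; all other bytes to 0xff.  */
  printf ("%08x\n", (unsigned) found_bits (0x11223344, 0x22));	/* fffeffff */
  return 0;
}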
diff --git a/ports/sysdeps/arm/armv6t2/memchr.S b/ports/sysdeps/arm/armv6t2/memchr.S
index 6d35f47..1739a4c 100644
--- a/ports/sysdeps/arm/armv6t2/memchr.S
+++ b/ports/sysdeps/arm/armv6t2/memchr.S
@@ -22,142 +22,128 @@
 @ and ARMv6T2 processors.  It has a fast path for short sizes, and has an
 @ optimised path for large data sets; the worst case is finding the match early
 @ in a large data set.
-@ Note: The use of cbz/cbnz means it's Thumb only
-
-@ 2011-07-15 david.gilbert@linaro.org
-@ Copy from Cortex strings release 21 and change license
-@ http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/view/head:/src/linaro-a9/memchr.S
-@ Change function declarations/entry/exit
-@ 2011-12-01 david.gilbert@linaro.org
-@ Add some fixes from comments received (including use of ldrd instead ldm)
-@ 2011-12-07 david.gilbert@linaro.org
-@ Removed cbz from align loop - can't be taken
-
-@ this lets us check a flag in a 00/ff byte easily in either endianness
-#ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
-#else
-#define CHARTSTMASK(c) 1<<(c*8)
-#endif
-	.syntax unified
+	.syntax unified
 	.text
-	.thumb
-@ ---------------------------------------------------------------------------
-	.thumb_func
-	.global memchr
-	.type memchr,%function
 ENTRY(memchr)
 	@ r0 = start of memory to scan
 	@ r1 = character to look for
 	@ r2 = length
 	@ returns r0 = pointer to character or NULL if not found
-	and	r1,r1,#0xff	@ Don't think we can trust the caller to actually pass a char
-
-	cmp	r2,#16		@ If it's short don't bother with anything clever
-	blt	20f
-	tst	r0, #7		@ If it's already aligned skip the next bit
-	beq	10f
+	uxtb	r1, r1
+	cmp	r2, #16			@ Is the buffer too short?
+	blo	.Lbuf_small
+	tst	r0, #7			@ Is the buffer already aligned?
+	beq	.Lbuf_aligned
 
 	@ Work up to an aligned point
-5:
-	ldrb	r3, [r0],#1
-	subs	r2, r2, #1
-	cmp	r3, r1
-	beq	50f		@ If it matches exit found
-	tst	r0, #7
-	bne	5b		@ If not aligned yet then do next byte
-
-10:
-	@ At this point, we are aligned, we know we have at least 8 bytes to work with
-	push	{r4,r5,r6,r7}
-	cfi_adjust_cfa_offset (16)
+0:	ldrb	r3, [r0], #1
+	s(sub)	r2, r2, #1
+	cmp	r3, r1			@ If found, adjust and return.
+	beq	.Lfound_minus1
+	tst	r0, #7			@ If not yet aligned, loop
+	bne	0b
+
+.Lbuf_aligned:
+	@ Here, we are aligned and we have at least 8 bytes to work with.
+	push	{ r4, r5 }
+	cfi_adjust_cfa_offset (8)
 	cfi_rel_offset (r4, 0)
 	cfi_rel_offset (r5, 4)
-	cfi_rel_offset (r6, 8)
-	cfi_rel_offset (r7, 12)
-	cfi_remember_state
-
-	orr	r1, r1, r1, lsl #8	@ expand the match word across to all bytes
+	orr	r1, r1, r1, lsl #8	@ Replicate C to all bytes
+	movw	ip, #0xfefe
 	orr	r1, r1, r1, lsl #16
-	bic	r4, r2, #7	@ Number of double words to work with * 8
-	mvns	r7, #0		@ all F's
-	movs	r3, #0
-
-15:
-	ldrd	r5,r6, [r0],#8
-	subs	r4, r4, #8
-	eor	r5,r5, r1	@ Get it so that r5,r6 have 00's where the bytes match the target
-	eor	r6,r6, r1
-	uadd8	r5, r5, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
-	sel	r5, r3, r7	@ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
-	uadd8	r6, r6, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
-	sel	r6, r5, r7	@ chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
-	cbnz	r6, 60f
-	bne	15b		@ (Flags from the subs above) If not run out of bytes then go around again
-
-	pop	{r4,r5,r6,r7}
-	cfi_adjust_cfa_offset (-16)
+	movt	ip, #0xfefe
+
+1:	ldrd	r4, r5, [r0], #8
+	s(eor)	r4, r4, r1		@ Convert C's to zeros
+	s(eor)	r5, r5, r1
+	@ Adding (unsigned saturating) 0xfe means a result of 0xfe for any
+	@ byte that was originally zero and 0xff otherwise.  Therefore we
+	@ consider the lsb of each byte the "found" bit, with 0 for a match.
+	uqadd8	r4, r4, ip
+	uqadd8	r5, r5, ip
+	cmp	r2, #8			@ Have we still a full 8 bytes left?
+	blo	.Lbuf_aligned_finish
+	s(sub)	r2, r2, #8		@ Account for the 8 bytes just read
+	s(and)	r5, r4, r5		@ Combine found bits between words.
+	mvns	r5, r5			@ Match within either word?
+	beq	1b
+
+	@ Here, we've found a match.  Disambiguate 1st or 2nd word.
+	mvns	r4, r4
+	itee	ne
+	subne	r0, r0, #8		@ Dec pointer to 1st word
+	subeq	r0, r0, #4		@ Dec pointer to 2nd word
+	moveq	r4, r5			@ Copy found bits from 2nd word.
+
+.Lfind_word:
+	@ Here we've found a match.
+	@ r0 = pointer to word containing the match
+	@ r4 = found bits for the word.
+#ifdef __ARMEL__
+	rbit	r4, r4			@ For LE we need count-trailing-zeros
+#endif
+	clz	r4, r4			@ Find the bit offset of the match.
+	add	r0, r0, r4, lsr #3	@ Adjust the pointer to the found byte
+
+	pop	{ r4, r5 }
+	cfi_remember_state
+	cfi_adjust_cfa_offset (-8)
 	cfi_restore (r4)
 	cfi_restore (r5)
-	cfi_restore (r6)
-	cfi_restore (r7)
-
-	and	r1,r1,#0xff	@ Get r1 back to a single character from the expansion above
-	and	r2,r2,#7	@ Leave the count remaining as the number after the double words have been done
-
-20:
-	cbz	r2, 40f		@ 0 length or hit the end already then not found
-
-21:	@ Post aligned section, or just a short call
-	ldrb	r3,[r0],#1
-	subs	r2,r2,#1
-	eor	r3,r3,r1	@ r3 = 0 if match - doesn't break flags from sub
-	cbz	r3, 50f
-	bne	21b		@ on r2 flags
-
-40:
-	movs	r0,#0		@ not found
-	DO_RET(lr)
+	bx	lr
 
-50:
-	subs	r0,r0,#1	@ found
-	DO_RET(lr)
-
-60:	@ We're here because the fast path found a hit - now we have to track down exactly which word it was
-	@ r0 points to the start of the double word after the one that was tested
-	@ r5 has the 00/ff pattern for the first word, r6 has the chained value
 	cfi_restore_state
-	cmp	r5, #0
-	itte	eq
-	moveq	r5, r6		@ the end is in the 2nd word
-	subeq	r0,r0,#3	@ Points to 2nd byte of 2nd word
-	subne	r0,r0,#7	@ or 2nd byte of 1st word
-
-	@ r0 currently points to the 2nd byte of the word containing the hit
-	tst	r5, # CHARTSTMASK(0)	@ 1st character
-	bne	61f
-	adds	r0,r0,#1
-	tst	r5, # CHARTSTMASK(1)	@ 2nd character
-	ittt	eq
-	addeq	r0,r0,#1
-	tsteq	r5, # (3<<15)		@ 2nd & 3rd character
-	@ If not the 3rd must be the last one
-	addeq	r0,r0,#1
-
-61:
-	pop	{r4,r5,r6,r7}
-	cfi_adjust_cfa_offset (-16)
+.Lbuf_aligned_finish:
+	@ Here we've read and computed found bits for 8 bytes, but not
+	@ all of those bytes are within the buffer.  Determine which
+	@ found bytes are really valid.
+	s(sub)	r0, r0, #8		@ Dec pointer to the 1st word
+	s(mvn)	r4, r4			@ Invert found bits of the 1st word
+	cmp	r2, #4			@ Do we have at least 4 bytes left?
+	blo	1f
+	cmp	r4, #0			@ Match within the 1st word?
+	bne	.Lfind_word
+	s(add)	r0, r0, #4		@ Inc pointer to the 2nd word
+	s(mvn)	r4, r5			@ Copy found bits from 2nd word
+	s(sub)	r2, r2, #4		@ Bytes remaining in 2nd word
+1:
+	lsls	r2, r2, #3		@ Convert remaining to bits
+	beq	2f			@ No bytes remaining?
+	mvn	r3, #0
+#ifdef __ARMEL__
+	s(lsl)	r3, r3, r2		@ Mask with 1s covering invalid bytes
+#else
+	s(lsr)	r3, r3, r2
+#endif
+	bics	r4, r4, r3		@ Clear found past end of buffer
+	bne	.Lfind_word
+2:
+	s(mov)	r0, #0			@ Not found
+	pop	{ r4, r5 }
+	cfi_adjust_cfa_offset (-8)
 	cfi_restore (r4)
 	cfi_restore (r5)
-	cfi_restore (r6)
-	cfi_restore (r7)
-
-	subs	r0,r0,#1
-	DO_RET(lr)
+	bx	lr
+
+.Lbuf_small:
+	@ Here we have a small buffer to be searched a byte at a time.
+	cmp	r2, #0			@ Zero-length buffer?  Not found.
+	beq	1f
+0:	ldrb	r3, [r0], #1
+	cmp	r3, r1			@ If found, adjust and return.
+	beq	.Lfound_minus1
+	subs	r2, r2, #1		@ Any bytes left?
+	bne	0b
+
+1:	s(mov)	r0, #0			@ Not found
+	bx	lr
+
+.Lfound_minus1:
+	@ Here we've found a match in a byte loop
+	@ r0 = pointer, post-incremented past the byte
+	s(sub)	r0, r0, #1
+	bx	lr
 END(memchr)
 
 libc_hidden_builtin_def (memchr)
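Likewise, a C sketch of the .Lfind_word step, for anyone checking the
pointer arithmetic: on little-endian the rbit/clz pair acts as a
count-trailing-zeros, and dividing the bit index by 8 turns the lsb
"found" bit back into a byte offset.  The function name is invented and
GCC's __builtin_ctz stands in for the two instructions.

#include <stdint.h>
#include <stdio.h>

/* Byte offset of the first match in an inverted found word, which has
   the lsb of each matching byte set.  The word must be nonzero, i.e. a
   match must already be known to exist.  */
static unsigned
first_match_offset (uint32_t inverted_found)
{
  return (unsigned) __builtin_ctz (inverted_found) >> 3;
}

int
main (void)
{
  /* 0x00010000: the lsb of byte 2 is set, i.e. the match is at
     offset 2 within the word.  */
  printf ("%u\n", first_match_offset (0x00010000));	/* 2 */
  return 0;
}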
-- 
1.8.1.2