From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 28836 invoked by alias); 27 Feb 2013 03:17:53 -0000 Received: (qmail 28788 invoked by uid 22791); 27 Feb 2013 03:17:49 -0000 X-SWARE-Spam-Status: No, hits=-4.8 required=5.0 tests=AWL,BAYES_00,DKIM_SIGNED,DKIM_VALID,FREEMAIL_ENVFROM_END_DIGIT,FREEMAIL_FROM,KHOP_RCVD_TRUST,KHOP_SPAMHAUS_DROP,KHOP_THREADED,RCVD_IN_DNSWL_LOW,RCVD_IN_HOSTKARMA_YE,SARE_SUB_6CONS_WORD,TW_CB,TW_CL,TW_DR,TW_MV,TW_OV,TW_XF X-Spam-Check-By: sourceware.org Received: from mail-pb0-f47.google.com (HELO mail-pb0-f47.google.com) (209.85.160.47) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Wed, 27 Feb 2013 03:17:11 +0000 Received: by mail-pb0-f47.google.com with SMTP id rp2so74397pbb.20 for ; Tue, 26 Feb 2013 19:17:10 -0800 (PST) X-Received: by 10.66.164.97 with SMTP id yp1mr5153324pab.47.1361935030632; Tue, 26 Feb 2013 19:17:10 -0800 (PST) Received: from pebble.twiddle.net (50-194-63-110-static.hfc.comcastbusiness.net. [50.194.63.110]) by mx.google.com with ESMTPS id pp1sm265271pac.7.2013.02.26.19.17.08 (version=TLSv1.2 cipher=RC4-SHA bits=128/128); Tue, 26 Feb 2013 19:17:09 -0800 (PST) From: Richard Henderson To: libc-ports@sourceware.org Cc: Joseph Myers Subject: [PATCH 22/26] arm: Implement armv6t2 optimized strchr, strrchr, rawmemchr Date: Wed, 27 Feb 2013 03:17:00 -0000 Message-Id: <1361934986-17018-23-git-send-email-rth@twiddle.net> In-Reply-To: <1361934986-17018-1-git-send-email-rth@twiddle.net> References: <1361934986-17018-1-git-send-email-rth@twiddle.net> X-IsSubscribed: yes Mailing-List: contact libc-ports-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: libc-ports-owner@sourceware.org X-SW-Source: 2013-02/txt/msg00080.txt.bz2 Not specifically speed tested against the byte-by-byte versions, but expected to be about as fast as the new strlen. --- * sysdeps/arm/armv6t2/strchr.S: New file. * sysdeps/arm/armv6t2/strrchr.S: New file. 
* sysdeps/arm/armv6t2/rawmemchr.S: New file. --- ports/sysdeps/arm/armv6t2/rawmemchr.S | 81 ++++++++++++++++++++ ports/sysdeps/arm/armv6t2/strchr.S | 138 ++++++++++++++++++++++++++++++++++ ports/sysdeps/arm/armv6t2/strrchr.S | 137 +++++++++++++++++++++++++++++++++ 3 files changed, 356 insertions(+) create mode 100644 ports/sysdeps/arm/armv6t2/rawmemchr.S create mode 100644 ports/sysdeps/arm/armv6t2/strchr.S create mode 100644 ports/sysdeps/arm/armv6t2/strrchr.S diff --git a/ports/sysdeps/arm/armv6t2/rawmemchr.S b/ports/sysdeps/arm/armv6t2/rawmemchr.S new file mode 100644 index 0000000..eea7707 --- /dev/null +++ b/ports/sysdeps/arm/armv6t2/rawmemchr.S @@ -0,0 +1,81 @@ +/* Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + + .syntax unified + .text + +ENTRY(__rawmemchr) + @ r0 = start of string + @ r1 = character to match + @ returns a pointer to the match, which must be present. + uxtb r1, r1 + + @ Loop until we find ... +1: ldrb r2, [r0], #1 + cmp r2, r1 @ ... the character + it ne + tstne r0, #7 @ ... the aligment point + bne 1b + + @ Disambiguate the exit possibilites above + cmp r2, r1 @ Found the character + itt eq + subeq r0, r0, #1 + bxeq lr + + @ So now we're aligned. 
+ orr r1, r1, r1, lsl #8 @ Replicate C to all bytes + movw ip, #0xfefe + orr r1, r1, r1, lsl #16 + movt ip, #0xfefe + + @ Loop searching for EOS or C, 8 bytes at a time. + @ Adding (unsigned saturating) 0xfe means result of 0xfe for any byte + @ that was originally zero and 0xff otherwise. Therefore we consider + @ the lsb of each byte the "found" bit, with 0 for a match. +2: ldrd r2, r3, [r0], #8 + s(eor) r2, r2, r1 @ Convert C bytes to 0 + s(eor) r3, r3, r1 + uqadd8 r2, r2, ip @ Find C + uqadd8 r3, r3, ip + s(and) r3, r3, r2 @ Combine the two words + mvns r3, r3 @ Test for any found bit true + beq 2b + + @ Found something. Disambiguate between first and second words. + @ Adjust r0 to point to the word containing the match. + @ Adjust r2 to the found bits for the word containing the match. + mvns r2, r2 + itee ne + subne r0, r0, #8 + subeq r0, r0, #4 + moveq r2, r3 + + @ Find the bit-offset of the match within the word. +#ifdef __ARMEL__ + rbit r2, r2 @ For LE we need count-trailing-zeros +#endif + clz r2, r2 + add r0, r0, r2, lsr #3 @ Adjust the pointer to the found byte + bx lr + +END(__rawmemchr) + +weak_alias (__rawmemchr, rawmemchr) +libc_hidden_def (__rawmemchr) diff --git a/ports/sysdeps/arm/armv6t2/strchr.S b/ports/sysdeps/arm/armv6t2/strchr.S new file mode 100644 index 0000000..e7f5acf --- /dev/null +++ b/ports/sysdeps/arm/armv6t2/strchr.S @@ -0,0 +1,138 @@ +/* Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + + .syntax unified + .text + +ENTRY(strchr) + @ r0 = start of string + @ r1 = character to match + @ returns NULL for no match, or a pointer to the match + + @ To cater to long strings, we want to search through a few + @ characters until we reach an aligned pointer. To cater to + @ small strings, we don't want to start doing word operations + @ immediately. The compromise is a maximum of 32 bytes less + @ whatever is required to end with an aligned pointer. + @ r3 = number of characters to search in alignment loop + and r3, r0, #7 + uxtb r1, r1 + rsb r3, r3, #32 + + @ Loop until we find ... +1: ldrb r2, [r0], #1 + subs r3, r3, #1 @ ... the aligment point + it ne + cmpne r2, r1 @ ... or the character + it ne + cmpne r2, #0 @ ... or EOS + bne 1b + + @ Disambiguate the exit possibilites above + cmp r2, r1 @ Found the character + itt eq + subeq r0, r0, #1 + bxeq lr + + cmp r2, #0 @ Found EOS + itt eq + moveq r0, #0 + bxeq lr + + @ So now we're aligned. Now we actually need a stack frame. + push { r4, r5, r6, r7 } + cfi_adjust_cfa_offset (16) + cfi_rel_offset (r4, 0) + cfi_rel_offset (r5, 4) + cfi_rel_offset (r6, 8) + cfi_rel_offset (r7, 12) + + orr r1, r1, r1, lsl #8 @ Replicate C to all bytes + movw ip, #0xfefe + orr r1, r1, r1, lsl #16 + movt ip, #0xfefe + + @ Loop searching for EOS or C, 8 bytes at a time. +2: ldrd r2, r3, [r0], #8 + @ Adding (unsigned saturating) 0xfe means result of 0xfe for any byte + @ that was originally zero and 0xff otherwise. Therefore we consider + @ the lsb of each byte the "found" bit, with 0 for a match. 
+ uqadd8 r4, r2, ip @ Find EOS + uqadd8 r5, r3, ip + eor r6, r2, r1 @ Convert C bytes to 0 + eor r7, r3, r1 + uqadd8 r6, r6, ip @ Find C + uqadd8 r7, r7, ip + s(and) r4, r4, r6 @ Combine found for EOS and C + s(and) r5, r5, r7 + and r6, r4, r5 @ Combine the two words + mvns r6, r6 @ Test for any found bit true + beq 2b + + @ Invert the sense of the found bits. After this we have 1 in + @ any byte that contains a match, and 0 otherwise. + s(mvn) r5, r5 + mvns r4, r4 + + @ Found something. Disambiguate between first and second words. + @ Adjust r0 to point to the word containing the match. + @ Adjust r2 to the contents of the word containing the match. + @ Adjust r4 to the found bits for the word containing the match. + iteee ne + subne r0, r0, #8 + subeq r0, r0, #4 + moveq r4, r5 + moveq r2, r3 + + @ Find the bit-offset of the match within the word. +#ifdef __ARMEL__ + @ For little-endian, we only need to reverse the bits so that + @ count-leading-zeros becomes in effect count-trailing-zeros. + rbit r4, r4 + clz r3, r4 +#else + @ For big-endian, we're matching 0x01 (not 0x80), and so the + @ bit offset is 7 too high. Also, we byte-swap the word so + @ that we can shift down to extract the found byte. + clz r3, r4 + rev r2, r2 + s(sub) r3, r3, #7 +#endif + s(lsr) r2, r2, r3 @ Shift down found byte + add r0, r0, r3, lsr #3 @ Adjust the pointer to the found byte + uxtb r2, r2 @ Extract found byte + uxtb r1, r1 @ Undo replication of C + + pop { r4, r5, r6, r7 } + cfi_adjust_cfa_offset (-16) + cfi_restore (r4) + cfi_restore (r5) + cfi_restore (r6) + cfi_restore (r7) + + @ Disambiguate between EOS and C. 
+ cmp r2, r1 + it ne + movne r0, #0 @ Found EOS, return NULL + bx lr + +END(strchr) + +weak_alias (strchr, index) +libc_hidden_builtin_def (strchr) diff --git a/ports/sysdeps/arm/armv6t2/strrchr.S b/ports/sysdeps/arm/armv6t2/strrchr.S new file mode 100644 index 0000000..483e52a --- /dev/null +++ b/ports/sysdeps/arm/armv6t2/strrchr.S @@ -0,0 +1,137 @@ +/* Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + + .syntax unified + .text + +ENTRY(strrchr) + @ r0 = start of string + @ r1 = character to match + @ returns NULL for no match, or a pointer to the match + + mov r3, r0 + s(mov) r0, #0 + uxtb r1, r1 + + @ Loop a few times until we're aligned. + tst r3, #7 + beq 2f +1: ldrb r2, [r3], #1 + cmp r2, r1 @ Find the character + it eq + subeq r0, r3, #1 + cmp r2, #0 @ Find EOS + it eq + bxeq lr + tst r3, #7 @ Find the aligment point + bne 1b + + @ So now we're aligned. Now we actually need a stack frame. +2: push { r4, r5, r6, r7 } + cfi_adjust_cfa_offset (16) + cfi_rel_offset (r4, 0) + cfi_rel_offset (r5, 4) + cfi_rel_offset (r6, 8) + cfi_rel_offset (r7, 12) + + orr r1, r1, r1, lsl #8 @ Replicate C to all bytes + movw ip, #0xfefe + orr r1, r1, r1, lsl #16 + movt ip, #0xfefe + s(mov) r2, #0 @ No found bits yet + + @ Loop searching for EOS and C, 8 bytes at a time. 
+ @ Any time we find a match in a word, we copy the address of + @ the word to r0, and the found bits to r2. +3: ldrd r4, r5, [r3], #8 + @ Adding (unsigned saturating) 0xfe means result of 0xfe for any byte + @ that was originally zero and 0xff otherwise. Therefore we consider + @ the lsb of each byte the "found" bit, with 0 for a match. + uqadd8 r6, r4, ip @ Find EOS + uqadd8 r7, r5, ip + s(eor) r4, r4, r1 @ Convert C bytes to 0 + s(eor) r5, r5, r1 + uqadd8 r4, r4, ip @ Find C + uqadd8 r5, r5, ip + mvns r6, r6 @ Found EOS, first word + bne 4f + mvns r4, r4 @ Handle C, first word + itt ne + subne r0, r3, #8 + movne r2, r4 + mvns r7, r7 @ Found EOS, second word + bne 5f + mvns r5, r5 @ Handle C, second word + itt ne + subne r0, r3, #4 + movne r2, r5 + b 3b + + @ Found EOS in second word; fold to first word. +5: s(add) r3, r3, #4 @ Dec pointer to 2nd word, with below + mov r4, r5 @ Overwrite first word C found + mov r6, r7 @ Overwrite first word EOS found + + @ Found EOS. Zap found C after EOS. +4: s(sub) r3, r3, #8 @ Decrement pointer to first word + s(mvn) r4, r4 @ Positive found bit for C +#ifdef __ARMEL__ + sub r7, r6, #1 @ Toggle EOS lsb and below + s(eor) r6, r6, r7 @ All bits below and including lsb + ands r4, r4, r6 @ Zap C above EOS +#else + clz r6, r6 @ Find highest EOS bit set. + s(mvn) r7, #0 + s(add) r6, r6, #1 + s(lsr) r7, r7, r6 @ All bits below msb + bics r4, r4, r7 @ Zap C below EOS +#endif + itt ne + movne r2, r4 @ Copy to result, if still non-zero + movne r0, r3 + + pop { r4, r5, r6, r7 } + cfi_adjust_cfa_offset (-16) + cfi_restore (r4) + cfi_restore (r5) + cfi_restore (r6) + cfi_restore (r7) + + @ Adjust the result pointer if we found a word containing C. + @ Rather than fight with thumb IT insn about how many insns + @ we'd like to conditionally execute, just jump over them here. +#ifdef __thumb2__ +#define ne(insn) insn + cbz r2, 6f @ Did we find any C? 
+#else
+#define ne(insn)	insn##ne
+	cmp	r2, #0			@ ARM mode: predicate the adjustment on r2 != 0
+#endif
+#ifdef __ARMEB__
+	ne(rbit) r2, r2			@ BE: last byte is the lowest found bit; reverse so clz finds it
+#endif
+	ne(clz)	r2, r2			@ Leading zeros above the found bit of the last C
+	ne(rsb)	r2, r2, #31		@ Bit index of that found bit: 8k (LE) or 8k+7 (BE after rbit); #32 gave k+1 on BE
+	ne(add)	r0, r0, r2, lsr #3	@ Bit index / 8 = byte offset k; add to the word address.
+6:	bx	lr
+
+END(strrchr)
+
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
-- 1.8.1.2