From: "Wilco Dijkstra"
Subject: RE: [PATCH][AArch64] Optimize strlen
Date: Fri, 27 Feb 2015 15:06:00 -0000
Message-ID: <000401d0529e$e8dca8b0$ba95fa10$@com>

ping

> -----Original Message-----
> From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> Sent: 16 January 2015 17:39
> To: libc-alpha@sourceware.org
> Subject: [PATCH][AArch64] Optimize strlen
>
> Optimize the strlen implementation by using a page cross check and a fast check
> for nul bytes which reverts to a separate loop when a non-ASCII char is encountered.
> Speedup on test-strlen is ~10%, long ASCII strings are processed ~60% faster,
> and on random tests it is ~80% better.
>
> ChangeLog:
>
> 2015-01-16  Wilco Dijkstra
>
> 	* sysdeps/aarch64/strlen.S (strlen): Optimize strlen.
>
> ---
>  sysdeps/aarch64/strlen.S | 230 +++++++++++++++++++++++++++++++++--------------
>  1 file changed, 165 insertions(+), 65 deletions(-)
>
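[Editor's note: the word-at-a-time NUL tests used throughout the patch below are easier to see in C. The following sketch is illustrative only and is not part of the patch or of glibc; the names has_nul_exact and has_nul_fast are invented for this note. It shows the exact syndrome (X - 1) & ~(X | 0x7f), which is non-zero iff the word contains a zero byte, and the cheaper (X - 1) & 0x80 test, which can report false hits for bytes 0x81..0xff and therefore has to be confirmed by the exact check.]

#include <stdint.h>
#include <stdio.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL
#define REP8_80 0x8080808080808080ULL

/* Non-zero iff some byte of x is zero (exact check).  */
static uint64_t has_nul_exact (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

/* Zero for words containing only non-NUL ASCII bytes; may report false
   hits for bytes in the range 0x81..0xff, so a hit still has to be
   confirmed with the exact check above.  */
static uint64_t has_nul_fast (uint64_t x)
{
  return (x - REP8_01) & REP8_80;
}

int main (void)
{
  uint64_t ascii    = 0x4141414141414141ULL; /* "AAAAAAAA", no NUL      */
  uint64_t with_nul = 0x4141414100414141ULL; /* NUL in byte 3           */
  uint64_t high     = 0x41414141c3414141ULL; /* 0xc3 byte, still no NUL */

  printf ("exact: %d %d %d\n", !!has_nul_exact (ascii),
          !!has_nul_exact (with_nul), !!has_nul_exact (high)); /* 0 1 0 */
  printf ("fast:  %d %d %d\n", !!has_nul_fast (ascii),
          !!has_nul_fast (with_nul), !!has_nul_fast (high));   /* 0 1 1 */
  return 0;
}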
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index 54271b9..7ca47d9 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -20,9 +20,13 @@
>
>  /* Assumptions:
>   *
> - * ARMv8-a, AArch64
> + * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
>   */
>
> +/* To test the page crossing code path more thoroughly, compile with
> +   -DTEST_PAGE_CROSS - this will force all calls through the slower
> +   entry path.  This option is not intended for production use.  */
> +
>  /* Arguments and results.  */
>  #define srcin		x0
>  #define len		x0
> @@ -31,87 +35,183 @@
>  #define src		x1
>  #define data1		x2
>  #define data2		x3
> -#define data2a		x4
> -#define has_nul1	x5
> -#define has_nul2	x6
> -#define tmp1		x7
> -#define tmp2		x8
> -#define tmp3		x9
> -#define tmp4		x10
> -#define zeroones	x11
> -#define pos		x12
> +#define has_nul1	x4
> +#define has_nul2	x5
> +#define tmp1		x4
> +#define tmp2		x5
> +#define tmp3		x6
> +#define tmp4		x7
> +#define zeroones	x8
> +
> +	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> +	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> +	   can be done in parallel across the entire word.  A faster check
> +	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
> +	   false hits for characters 129..255.  */
>
>  #define REP8_01 0x0101010101010101
>  #define REP8_7f 0x7f7f7f7f7f7f7f7f
>  #define REP8_80 0x8080808080808080
>
> -	/* Start of critial section -- keep to one 64Byte cache line.  */
> +#ifdef TEST_PAGE_CROSS
> +# define MIN_PAGE_SIZE 15
> +#else
> +# define MIN_PAGE_SIZE 4096
> +#endif
> +
> +	/* Since strings are short on average, we check the first 16 bytes
> +	   of the string for a NUL character.  In order to do an unaligned ldp
> +	   safely we have to do a page cross check first.  If there is a NUL
> +	   byte we calculate the length from the 2 8-byte words using
> +	   conditional select to reduce branch mispredictions (it is unlikely
> +	   strlen will be repeatedly called on strings with the same length).
> +
> +	   If the string is longer than 16 bytes, we align src so we don't need
> +	   further page cross checks, and process 32 bytes per iteration
> +	   using the fast NUL check.  If we encounter non-ASCII characters,
> +	   fall back to a second loop using the full NUL check.
> +
> +	   If the page cross check fails, we read 16 bytes from an aligned
> +	   address, remove any characters before the string, and continue
> +	   in the main loop using aligned loads.  Since strings crossing a
> +	   page in the first 16 bytes are rare (probability of
> +	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> +
> +	   AArch64 systems have a minimum page size of 4k.  We don't bother
> +	   checking for larger page sizes - the cost of setting up the correct
> +	   page size is just not worth the extra gain from a small reduction in
> +	   the cases taking the slow path.  Note that we only care about
> +	   whether the first fetch, which may be misaligned, crosses a page
> +	   boundary.  */
> +
>  ENTRY_ALIGN (strlen, 6)
> -	mov	zeroones, #REP8_01
> -	bic	src, srcin, #15
> -	ands	tmp1, srcin, #15
> -	b.ne	L(misaligned)
> -	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> -	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> -	   can be done in parallel across the entire word.  */
> -	/* The inner loop deals with two Dwords at a time.  This has a
> -	   slightly higher start-up cost, but we should win quite quickly,
> -	   especially on cores with a high number of issue slots per
> -	   cycle, as we get much better parallelism out of the operations.  */
> -L(loop):
> -	ldp	data1, data2, [src], #16
> -L(realigned):
> +	and	tmp1, srcin, MIN_PAGE_SIZE - 1
> +	mov	zeroones, REP8_01
> +	cmp	tmp1, MIN_PAGE_SIZE - 16
> +	b.gt	L(page_cross)
> +	ldp	data1, data2, [srcin]
> +#ifdef __AARCH64EB__
> +	/* For big-endian, carry propagation (if the final byte in the
> +	   string is 0x01) means we cannot use has_nul1/2 directly.
> +	   Since we expect strings to be small and early-exit,
> +	   byte-swap the data now so has_nul1/2 will be correct.  */
> +	rev	data1, data1
> +	rev	data2, data2
> +#endif
>  	sub	tmp1, data1, zeroones
> -	orr	tmp2, data1, #REP8_7f
> +	orr	tmp2, data1, REP8_7f
>  	sub	tmp3, data2, zeroones
> -	orr	tmp4, data2, #REP8_7f
> -	bic	has_nul1, tmp1, tmp2
> -	bics	has_nul2, tmp3, tmp4
> -	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
> -	b.eq	L(loop)
> -	/* End of critical section -- keep to one 64Byte cache line.  */
> +	orr	tmp4, data2, REP8_7f
> +	bics	has_nul1, tmp1, tmp2
> +	bic	has_nul2, tmp3, tmp4
> +	ccmp	has_nul2, 0, 0, eq
> +	beq	L(main_loop_entry)
>
> -	sub	len, src, srcin
> -	cbz	has_nul1, L(nul_in_data2)
> -#ifdef __AARCH64EB__
> -	mov	data2, data1
> -#endif
> -	sub	len, len, #8
> -	mov	has_nul2, has_nul1
> -L(nul_in_data2):
> +	/* Enter with C = has_nul1 == 0.  */
> +	csel	has_nul1, has_nul1, has_nul2, cc
> +	mov	len, 8
> +	rev	has_nul1, has_nul1
> +	clz	tmp1, has_nul1
> +	csel	len, xzr, len, cc
> +	add	len, len, tmp1, lsr 3
> +	ret
> +
> +	/* The inner loop processes 32 bytes per iteration and uses the fast
> +	   NUL check.  If we encounter non-ASCII characters, use a second
> +	   loop with the accurate NUL check.  */
> +	.p2align 4
> +L(main_loop_entry):
> +	bic	src, srcin, 15
> +	sub	src, src, 16
> +L(main_loop):
> +	ldp	data1, data2, [src, 32]!
> +L(page_cross_entry):
> +	sub	tmp1, data1, zeroones
> +	sub	tmp3, data2, zeroones
> +	orr	tmp2, tmp1, tmp3
> +	tst	tmp2, zeroones, lsl 7
> +	bne	1f
> +	ldp	data1, data2, [src, 16]
> +	sub	tmp1, data1, zeroones
> +	sub	tmp3, data2, zeroones
> +	orr	tmp2, tmp1, tmp3
> +	tst	tmp2, zeroones, lsl 7
> +	beq	L(main_loop)
> +	add	src, src, 16
> +1:
> +	/* The fast check failed, so do the slower, accurate NUL check.  */
> +	orr	tmp2, data1, REP8_7f
> +	orr	tmp4, data2, REP8_7f
> +	bics	has_nul1, tmp1, tmp2
> +	bic	has_nul2, tmp3, tmp4
> +	ccmp	has_nul2, 0, 0, eq
> +	beq	L(nonascii_loop)
> +
> +	/* Enter with C = has_nul1 == 0.  */
> +L(tail):
>  #ifdef __AARCH64EB__
>  	/* For big-endian, carry propagation (if the final byte in the
> -	   string is 0x01) means we cannot use has_nul directly.  The
> +	   string is 0x01) means we cannot use has_nul1/2 directly.  The
>  	   easiest way to get the correct byte is to byte-swap the data
>  	   and calculate the syndrome a second time.  */
> -	rev	data2, data2
> -	sub	tmp1, data2, zeroones
> -	orr	tmp2, data2, #REP8_7f
> -	bic	has_nul2, tmp1, tmp2
> +	csel	data1, data1, data2, cc
> +	rev	data1, data1
> +	sub	tmp1, data1, zeroones
> +	orr	tmp2, data1, REP8_7f
> +	bic	has_nul1, tmp1, tmp2
> +#else
> +	csel	has_nul1, has_nul1, has_nul2, cc
>  #endif
> -	sub	len, len, #8
> -	rev	has_nul2, has_nul2
> -	clz	pos, has_nul2
> -	add	len, len, pos, lsr #3		/* Bits to bytes.  */
> -	RET
> -
> -L(misaligned):
> -	cmp	tmp1, #8
> -	neg	tmp1, tmp1
> -	ldp	data1, data2, [src], #16
> -	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
> -	mov	tmp2, #~0
> +	sub	len, src, srcin
> +	rev	has_nul1, has_nul1
> +	add	tmp2, len, 8
> +	clz	tmp1, has_nul1
> +	csel	len, len, tmp2, cc
> +	add	len, len, tmp1, lsr 3
> +	ret
> +
> +L(nonascii_loop):
> +	ldp	data1, data2, [src, 16]!
> +	sub	tmp1, data1, zeroones
> +	orr	tmp2, data1, REP8_7f
> +	sub	tmp3, data2, zeroones
> +	orr	tmp4, data2, REP8_7f
> +	bics	has_nul1, tmp1, tmp2
> +	bic	has_nul2, tmp3, tmp4
> +	ccmp	has_nul2, 0, 0, eq
> +	bne	L(tail)
> +	ldp	data1, data2, [src, 16]!
> +	sub	tmp1, data1, zeroones
> +	orr	tmp2, data1, REP8_7f
> +	sub	tmp3, data2, zeroones
> +	orr	tmp4, data2, REP8_7f
> +	bics	has_nul1, tmp1, tmp2
> +	bic	has_nul2, tmp3, tmp4
> +	ccmp	has_nul2, 0, 0, eq
> +	beq	L(nonascii_loop)
> +	b	L(tail)
> +
> +	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> +	   srcin to 0x7f, so we ignore any NUL bytes before the string.
> +	   Then continue in the aligned loop.  */
> +L(page_cross):
> +	bic	src, srcin, 15
> +	ldp	data1, data2, [src]
> +	lsl	tmp1, srcin, 3
> +	mov	tmp4, -1
>  #ifdef __AARCH64EB__
> -	/* Big-endian.  Early bytes are at MSB.  */
> -	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> +	/* Big-endian.  Early bytes are at MSB.  */
> +	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
>  #else
>  	/* Little-endian.  Early bytes are at LSB.  */
> -	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> +	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
>  #endif
> -	orr	data1, data1, tmp2
> -	orr	data2a, data2, tmp2
> -	csinv	data1, data1, xzr, le
> -	csel	data2, data2, data2a, le
> -	b	L(realigned)
> +	orr	tmp1, tmp1, REP8_80
> +	orn	data1, data1, tmp1
> +	orn	tmp2, data2, tmp1
> +	tst	srcin, 8
> +	csel	data1, data1, tmp4, eq
> +	csel	data2, data2, tmp2, eq
> +	b	L(page_cross_entry)
>  END (strlen)
>  libc_hidden_builtin_def (strlen)
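[Editor's note: a second illustrative C sketch, not part of the patch; the names needs_page_cross_path and load_first16_aligned are invented here. The first function corresponds to the check at ENTRY (and exercised by -DTEST_PAGE_CROSS): an unaligned 16-byte load from s is only a problem when fewer than 16 bytes remain in the minimum-sized page containing s. The second shows the idea behind L(page_cross): load the enclosing aligned 16-byte block and neutralise the bytes before the string so the normal NUL checks can be reused; the assembly ORs in a 0x7f mask instead of overwriting, but the effect on the NUL test is the same.]

#include <stdint.h>
#include <string.h>

#define MIN_PAGE_SIZE 4096

/* True when an unaligned 16-byte load from s might cross a page
   boundary, i.e. when fewer than 16 bytes remain in the minimum-sized
   page containing s.  */
int needs_page_cross_path (const char *s)
{
  return ((uintptr_t) s & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}

/* Aligned fallback (mirrors the assembly; not strictly portable C):
   read the 16-byte block containing s and force the bytes that precede
   s to 0x7f so they can never look like a NUL.  */
void load_first16_aligned (const char *s, unsigned char buf[16])
{
  const unsigned char *block = (const unsigned char *) ((uintptr_t) s & ~(uintptr_t) 15);
  size_t lead = (uintptr_t) s & 15;

  memcpy (buf, block, 16);   /* aligned load, cannot cross a page     */
  memset (buf, 0x7f, lead);  /* ignore the bytes before the string    */
}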
> --
> 1.9.1
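[Editor's note: a final illustrative sketch, not part of the patch, of how a length is derived once a syndrome is non-zero, as in the early-exit path and L(tail) above. The name first_nul_index is invented; the code assumes little-endian data and that at least one of the two words contains a NUL byte (otherwise the assembly falls through to the main loop). The assembly byte-reverses the syndrome and counts leading zeros; counting trailing zeros on the unreversed syndrome is the little-endian equivalent.]

#include <stdint.h>
#include <stddef.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Index of the first NUL byte within the 16 bytes held in data1/data2
   (little-endian byte order), assuming one exists.  */
size_t first_nul_index (uint64_t data1, uint64_t data2)
{
  uint64_t has_nul1 = (data1 - REP8_01) & ~(data1 | REP8_7f);
  uint64_t has_nul2 = (data2 - REP8_01) & ~(data2 | REP8_7f);

  /* Pick the first word with a hit and remember whether 8 bytes were
     already skipped - this mirrors the csel/add sequence.  */
  uint64_t syndrome = has_nul1 ? has_nul1 : has_nul2;
  size_t base = has_nul1 ? 0 : 8;

  /* Bit 7 of the first NUL byte is set in the syndrome (bytes after it
     may set spurious bits, but only at higher positions), so the number
     of trailing zero bits divided by 8 is the byte index of the first
     NUL.  __builtin_ctzll is a GCC/Clang builtin.  */
  return base + ((size_t) __builtin_ctzll (syndrome) >> 3);
}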