From: "Wilco Dijkstra" <wdijkstr@arm.com>
Subject: RE: [PATCH][AArch64] Optimize strlen
Date: Wed, 15 Apr 2015 12:34:00 -0000
Message-ID: <000601d07778$768457f0$638d07d0$@com>

ping

> > -----Original Message-----
> > From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> > Sent: 16 January 2015 17:39
> > To: libc-alpha@sourceware.org
> > Subject: [PATCH][AArch64] Optimize strlen
> >
> > Optimize the strlen implementation by using a page cross check and a fast check
> > for nul bytes which reverts to a separate loop when a non-ASCII char is encountered.
> > Speedup on test-strlen is ~10%, long ASCII strings are processed ~60% faster,
> > and on random tests it is ~80% better.
> >
> > ChangeLog:
> >
> > 2015-01-16  Wilco Dijkstra
> >
> > 	* sysdeps/aarch64/strlen.S (strlen): Optimize strlen.
> >
> > ---
> >  sysdeps/aarch64/strlen.S | 230 +++++++++++++++++++++++++++++++++--------------
> >  1 file changed, 165 insertions(+), 65 deletions(-)
> >
> > diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> > index 54271b9..7ca47d9 100644
> > --- a/sysdeps/aarch64/strlen.S
> > +++ b/sysdeps/aarch64/strlen.S
> > @@ -20,9 +20,13 @@
> >
> >  /* Assumptions:
> >   *
> > - * ARMv8-a, AArch64
> > + * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> >   */
> >
> > +/* To test the page crossing code path more thoroughly, compile with
> > +   -DTEST_PAGE_CROSS - this will force all calls through the slower
> > +   entry path.  This option is not intended for production use.  */
> > +
> >  /* Arguments and results.  */
> >  #define srcin		x0
> >  #define len		x0
> >
> > @@ -31,87 +35,183 @@
> >  #define src		x1
> >  #define data1		x2
> >  #define data2		x3
> > -#define data2a		x4
> > -#define has_nul1	x5
> > -#define has_nul2	x6
> > -#define tmp1		x7
> > -#define tmp2		x8
> > -#define tmp3		x9
> > -#define tmp4		x10
> > -#define zeroones	x11
> > -#define pos		x12
> > +#define has_nul1	x4
> > +#define has_nul2	x5
> > +#define tmp1		x4
> > +#define tmp2		x5
> > +#define tmp3		x6
> > +#define tmp4		x7
> > +#define zeroones	x8
> > +
> > +	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> > +	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> > +	   can be done in parallel across the entire word.  A faster check
> > +	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
> > +	   false hits for characters 129..255.  */
> >
> >  #define REP8_01 0x0101010101010101
> >  #define REP8_7f 0x7f7f7f7f7f7f7f7f
> >  #define REP8_80 0x8080808080808080
> >
> > -	/* Start of critial section -- keep to one 64Byte cache line.  */
> > +#ifdef TEST_PAGE_CROSS
> > +# define MIN_PAGE_SIZE 15
> > +#else
> > +# define MIN_PAGE_SIZE 4096
> > +#endif
> > +
> > +	/* Since strings are short on average, we check the first 16 bytes
> > +	   of the string for a NUL character.  In order to do an unaligned ldp
> > +	   safely we have to do a page cross check first.  If there is a NUL
> > +	   byte we calculate the length from the 2 8-byte words using
> > +	   conditional select to reduce branch mispredictions (it is unlikely
> > +	   strlen will be repeatedly called on strings with the same length).
> > +
> > +	   If the string is longer than 16 bytes, we align src so we don't need
> > +	   further page cross checks, and process 32 bytes per iteration
> > +	   using the fast NUL check.  If we encounter non-ASCII characters,
> > +	   fall back to a second loop using the full NUL check.
> > +
> > +	   If the page cross check fails, we read 16 bytes from an aligned
> > +	   address, remove any characters before the string, and continue
> > +	   in the main loop using aligned loads.  Since strings crossing a
> > +	   page in the first 16 bytes are rare (probability of
> > +	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> > +
> > +	   AArch64 systems have a minimum page size of 4k.  We don't bother
> > +	   checking for larger page sizes - the cost of setting up the correct
> > +	   page size is just not worth the extra gain from a small reduction in
> > +	   the cases taking the slow path.  Note that we only care about
> > +	   whether the first fetch, which may be misaligned, crosses a page
> > +	   boundary.  */
> > +
> >  ENTRY_ALIGN (strlen, 6)
> > -	mov	zeroones, #REP8_01
> > -	bic	src, srcin, #15
> > -	ands	tmp1, srcin, #15
> > -	b.ne	L(misaligned)
> > -	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> > -	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> > -	   can be done in parallel across the entire word.  */
> > -	/* The inner loop deals with two Dwords at a time.  This has a
> > -	   slightly higher start-up cost, but we should win quite quickly,
> > -	   especially on cores with a high number of issue slots per
> > -	   cycle, as we get much better parallelism out of the operations.  */
> > -L(loop):
> > -	ldp	data1, data2, [src], #16
> > -L(realigned):
> > +	and	tmp1, srcin, MIN_PAGE_SIZE - 1
> > +	mov	zeroones, REP8_01
> > +	cmp	tmp1, MIN_PAGE_SIZE - 16
> > +	b.gt	L(page_cross)
> > +	ldp	data1, data2, [srcin]
> > +#ifdef __AARCH64EB__
> > +	/* For big-endian, carry propagation (if the final byte in the
> > +	   string is 0x01) means we cannot use has_nul1/2 directly.
> > +	   Since we expect strings to be small and early-exit,
> > +	   byte-swap the data now so has_nul1/2 will be correct.  */
> > +	rev	data1, data1
> > +	rev	data2, data2
> > +#endif
> >  	sub	tmp1, data1, zeroones
> > -	orr	tmp2, data1, #REP8_7f
> >  	sub	tmp3, data2, zeroones
> > -	orr	tmp4, data2, #REP8_7f
> > -	bic	has_nul1, tmp1, tmp2
> > -	bics	has_nul2, tmp3, tmp4
> > -	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
> > -	b.eq	L(loop)
> > -	/* End of critical section -- keep to one 64Byte cache line.  */
> > +	orr	tmp4, data2, REP8_7f
> > +	bics	has_nul1, tmp1, tmp2
> > +	bic	has_nul2, tmp3, tmp4
> > +	ccmp	has_nul2, 0, 0, eq
> > +	beq	L(main_loop_entry)
> >
> > -	sub	len, src, srcin
> > -	cbz	has_nul1, L(nul_in_data2)
> > -#ifdef __AARCH64EB__
> > -	mov	data2, data1
> > -#endif
> > -	sub	len, len, #8
> > -	mov	has_nul2, has_nul1
> > -L(nul_in_data2):
> > +	/* Enter with C = has_nul1 == 0.  */
> > +	csel	has_nul1, has_nul1, has_nul2, cc
> > +	mov	len, 8
> > +	rev	has_nul1, has_nul1
> > +	clz	tmp1, has_nul1
> > +	csel	len, xzr, len, cc
> > +	add	len, len, tmp1, lsr 3
> > +	ret
> > +
> > +	/* The inner loop processes 32 bytes per iteration and uses the fast
> > +	   NUL check.  If we encounter non-ASCII characters, use a second
> > +	   loop with the accurate NUL check.  */
> > +	.p2align 4
> > +L(main_loop_entry):
> > +	bic	src, srcin, 15
> > +	sub	src, src, 16
> > +L(main_loop):
> > +	ldp	data1, data2, [src, 32]!
> > +.Lpage_cross_entry:
> > +	sub	tmp1, data1, zeroones
> > +	sub	tmp3, data2, zeroones
> > +	orr	tmp2, tmp1, tmp3
> > +	tst	tmp2, zeroones, lsl 7
> > +	bne	1f
> > +	ldp	data1, data2, [src, 16]
> > +	sub	tmp1, data1, zeroones
> > +	sub	tmp3, data2, zeroones
> > +	orr	tmp2, tmp1, tmp3
> > +	tst	tmp2, zeroones, lsl 7
> > +	beq	L(main_loop)
> > +	add	src, src, 16
> > +1:
> > +	/* The fast check failed, so do the slower, accurate NUL check.  */
> > +	orr	tmp2, data1, REP8_7f
> > +	orr	tmp4, data2, REP8_7f
> > +	bics	has_nul1, tmp1, tmp2
> > +	bic	has_nul2, tmp3, tmp4
> > +	ccmp	has_nul2, 0, 0, eq
> > +	beq	L(nonascii_loop)
> > +
> > +	/* Enter with C = has_nul1 == 0.  */
> > +L(tail):
> >  #ifdef __AARCH64EB__
> >  	/* For big-endian, carry propagation (if the final byte in the
> > -	   string is 0x01) means we cannot use has_nul directly.  The
> > +	   string is 0x01) means we cannot use has_nul1/2 directly.  The
> >  	   easiest way to get the correct byte is to byte-swap the data
> >  	   and calculate the syndrome a second time.  */
> > -	rev	data2, data2
> > -	sub	tmp1, data2, zeroones
> > -	orr	tmp2, data2, #REP8_7f
> > -	bic	has_nul2, tmp1, tmp2
> > +	csel	data1, data1, data2, cc
> > +	rev	data1, data1
> > +	sub	tmp1, data1, zeroones
> > +	orr	tmp2, data1, REP8_7f
> > +	bic	has_nul1, tmp1, tmp2
> > +#else
> > +	csel	has_nul1, has_nul1, has_nul2, cc
> >  #endif
> > -	sub	len, len, #8
> > -	rev	has_nul2, has_nul2
> > -	clz	pos, has_nul2
> > -	add	len, len, pos, lsr #3	/* Bits to bytes.  */
> > -	RET
> > -
> > -L(misaligned):
> > -	cmp	tmp1, #8
> > -	neg	tmp1, tmp1
> > -	ldp	data1, data2, [src], #16
> > -	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
> > -	mov	tmp2, #~0
> > +	sub	len, src, srcin
> > +	rev	has_nul1, has_nul1
> > +	add	tmp2, len, 8
> > +	clz	tmp1, has_nul1
> > +	csel	len, len, tmp2, cc
> > +	add	len, len, tmp1, lsr 3
> > +	ret
> > +
> > +L(nonascii_loop):
> > +	ldp	data1, data2, [src, 16]!
> > +	sub	tmp1, data1, zeroones
> > +	orr	tmp2, data1, REP8_7f
> > +	sub	tmp3, data2, zeroones
> > +	orr	tmp4, data2, REP8_7f
> > +	bics	has_nul1, tmp1, tmp2
> > +	bic	has_nul2, tmp3, tmp4
> > +	ccmp	has_nul2, 0, 0, eq
> > +	bne	L(tail)
> > +	ldp	data1, data2, [src, 16]!
> > +	sub	tmp1, data1, zeroones
> > +	orr	tmp2, data1, REP8_7f
> > +	sub	tmp3, data2, zeroones
> > +	orr	tmp4, data2, REP8_7f
> > +	bics	has_nul1, tmp1, tmp2
> > +	bic	has_nul2, tmp3, tmp4
> > +	ccmp	has_nul2, 0, 0, eq
> > +	beq	L(nonascii_loop)
> > +	b	L(tail)
> > +
> > +	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> > +	   srcin to 0x7f, so we ignore any NUL bytes before the string.
> > +	   Then continue in the aligned loop.  */
> > +L(page_cross):
> > +	bic	src, srcin, 15
> > +	ldp	data1, data2, [src]
> > +	lsl	tmp1, srcin, 3
> > +	mov	tmp4, -1
> >  #ifdef __AARCH64EB__
> > -	/* Big-endian.  Early bytes are at MSB.  */
> > -	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> > +	/* Big-endian.  Early bytes are at MSB.  */
> > +	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
> >  #else
> >  	/* Little-endian.  Early bytes are at LSB.  */
> > -	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> > +	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
> >  #endif
> > -	orr	data1, data1, tmp2
> > -	orr	data2a, data2, tmp2
> > -	csinv	data1, data1, xzr, le
> > -	csel	data2, data2, data2a, le
> > -	b	L(realigned)
> > +	orr	tmp1, tmp1, REP8_80
> > +	orn	data1, data1, tmp1
> > +	orn	tmp2, data2, tmp1
> > +	tst	srcin, 8
> > +	csel	data1, data1, tmp4, eq
> > +	csel	data2, data2, tmp2, eq
> > +	b	L(page_cross_entry)
> >  END (strlen)
> >  libc_hidden_builtin_def (strlen)
> > --
> > 1.9.1
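
A note for reviewers: the two NUL checks described in the patch comments can be sketched in plain C.  This is only an illustration of the bit tricks, not code from the patch; the helper names has_zero_byte and maybe_zero_byte are invented for this example.

#include <stdint.h>
#include <stdio.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL
#define REP8_80 0x8080808080808080ULL

/* Accurate check: (X - 1) & ~(X | 0x7f..7f) is non-zero iff some byte
   of X is zero.  This mirrors the sub/orr/bics sequence in the patch.  */
static int
has_zero_byte (uint64_t x)
{
  return ((x - REP8_01) & ~(x | REP8_7f)) != 0;
}

/* Fast check: (X - 1) & 0x80..80 is zero for non-NUL ASCII bytes, but
   gives false hits for bytes 0x81..0xff, so a hit must be confirmed
   with the accurate check above.  */
static int
maybe_zero_byte (uint64_t x)
{
  return ((x - REP8_01) & REP8_80) != 0;
}

int
main (void)
{
  uint64_t ascii = 0x4142434445464748ULL;	/* eight ASCII letters, no NUL */
  uint64_t with_nul = 0x4142434400464748ULL;	/* one zero byte */
  uint64_t high = 0x41424344ff464748ULL;	/* 0xff byte, no NUL */

  printf ("%d %d\n", has_zero_byte (ascii), maybe_zero_byte (ascii));		/* 0 0 */
  printf ("%d %d\n", has_zero_byte (with_nul), maybe_zero_byte (with_nul));	/* 1 1 */
  printf ("%d %d\n", has_zero_byte (high), maybe_zero_byte (high));		/* 0 1: false hit */
  return 0;
}

The fast check is what the main loop performs on two words at once (tst tmp2, zeroones, lsl 7 tests against the 0x80..80 mask); any hit there is confirmed with the accurate check before the length is computed.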
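Similarly, the page cross test at the entry point (the and/cmp/b.gt on srcin) amounts to the condition below; again just a hedged C sketch, with crosses_page being an invented name:

#include <stdint.h>

#define MIN_PAGE_SIZE 4096	/* minimum page size assumed by the patch */

/* A 16-byte unaligned load starting at addr stays within one page iff
   the offset of addr inside the page is at most MIN_PAGE_SIZE - 16;
   otherwise strlen takes the L(page_cross) path.  */
static int
crosses_page (uintptr_t addr)
{
  return (addr & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}

Only offsets above MIN_PAGE_SIZE - 16 fail the check, which is where the ~0.4% figure in the comment comes from; as the patch comment notes, building the assembly with -DTEST_PAGE_CROSS shrinks MIN_PAGE_SIZE so that every call exercises this path.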