From: "Wilco Dijkstra" <wdijkstr@arm.com>
Subject: RE: [PATCH][AArch64] Optimize strlen
Date: Wed, 15 Apr 2015 12:34:00 -0000
Message-ID: <000601d07778$768457f0$638d07d0$@com>

ping

> > -----Original Message-----
> > From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> > Sent: 16 January 2015 17:39
> > To: libc-alpha@sourceware.org
> > Subject: [PATCH][AArch64] Optimize strlen
> >
> > Optimize the strlen implementation by using a page cross check and a fast check
> > for nul bytes which reverts to a separate loop when a non-ASCII char is encountered.
> > Speedup on test-strlen is ~10%, long ASCII strings are processed ~60% faster,
> > and on random tests it is ~80% better.
> >
> > ChangeLog:
> >
> > 2015-01-16  Wilco Dijkstra
> >
> > 	* sysdeps/aarch64/strlen.S (strlen): Optimize strlen.
> >
> > ---
> >  sysdeps/aarch64/strlen.S | 230 +++++++++++++++++++++++++++++++++--------------
> >  1 file changed, 165 insertions(+), 65 deletions(-)
> >
> > diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> > index 54271b9..7ca47d9 100644
> > --- a/sysdeps/aarch64/strlen.S
> > +++ b/sysdeps/aarch64/strlen.S
> > @@ -20,9 +20,13 @@
> >
> >  /* Assumptions:
> >   *
> > - * ARMv8-a, AArch64
> > + * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> >   */
> >
> > +/* To test the page crossing code path more thoroughly, compile with
> > +   -DTEST_PAGE_CROSS - this will force all calls through the slower
> > +   entry path.  This option is not intended for production use.  */
> > +
> >  /* Arguments and results.  */
> >  #define srcin		x0
> >  #define len		x0
> >
> > @@ -31,87 +35,183 @@
> >  #define src		x1
> >  #define data1		x2
> >  #define data2		x3
> > -#define data2a		x4
> > -#define has_nul1	x5
> > -#define has_nul2	x6
> > -#define tmp1		x7
> > -#define tmp2		x8
> > -#define tmp3		x9
> > -#define tmp4		x10
> > -#define zeroones	x11
> > -#define pos		x12
> > +#define has_nul1	x4
> > +#define has_nul2	x5
> > +#define tmp1		x4
> > +#define tmp2		x5
> > +#define tmp3		x6
> > +#define tmp4		x7
> > +#define zeroones	x8
> > +
> > +	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> > +	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> > +	   can be done in parallel across the entire word.  A faster check
> > +	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
> > +	   false hits for characters 129..255.  */
> >
> >  #define REP8_01 0x0101010101010101
> >  #define REP8_7f 0x7f7f7f7f7f7f7f7f
> >  #define REP8_80 0x8080808080808080
> >
> > -	/* Start of critial section -- keep to one 64Byte cache line.  */
> > +#ifdef TEST_PAGE_CROSS
> > +# define MIN_PAGE_SIZE 15
> > +#else
> > +# define MIN_PAGE_SIZE 4096
> > +#endif
> > +
> > +	/* Since strings are short on average, we check the first 16 bytes
> > +	   of the string for a NUL character.  In order to do an unaligned ldp
> > +	   safely we have to do a page cross check first.  If there is a NUL
> > +	   byte we calculate the length from the 2 8-byte words using
> > +	   conditional select to reduce branch mispredictions (it is unlikely
> > +	   strlen will be repeatedly called on strings with the same length).
> > +
> > +	   If the string is longer than 16 bytes, we align src so we don't need
> > +	   further page cross checks, and process 32 bytes per iteration
> > +	   using the fast NUL check.  If we encounter non-ASCII characters,
> > +	   fall back to a second loop using the full NUL check.
> > +
> > +	   If the page cross check fails, we read 16 bytes from an aligned
> > +	   address, remove any characters before the string, and continue
> > +	   in the main loop using aligned loads.  Since strings crossing a
> > +	   page in the first 16 bytes are rare (probability of
> > +	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> > +
> > +	   AArch64 systems have a minimum page size of 4k.  We don't bother
> > +	   checking for larger page sizes - the cost of setting up the correct
> > +	   page size is just not worth the extra gain from a small reduction in
> > +	   the cases taking the slow path.  Note that we only care about
> > +	   whether the first fetch, which may be misaligned, crosses a page
> > +	   boundary.  */
> > +
> >  ENTRY_ALIGN (strlen, 6)
> > -	mov	zeroones, #REP8_01
> > -	bic	src, srcin, #15
> > -	ands	tmp1, srcin, #15
> > -	b.ne	L(misaligned)
> > -	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> > -	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> > -	   can be done in parallel across the entire word.  */
> > -	/* The inner loop deals with two Dwords at a time.  This has a
> > -	   slightly higher start-up cost, but we should win quite quickly,
> > -	   especially on cores with a high number of issue slots per
> > -	   cycle, as we get much better parallelism out of the operations.  */
> > -L(loop):
> > -	ldp	data1, data2, [src], #16
> > -L(realigned):
> > +	and	tmp1, srcin, MIN_PAGE_SIZE - 1
> > +	mov	zeroones, REP8_01
> > +	cmp	tmp1, MIN_PAGE_SIZE - 16
> > +	b.gt	L(page_cross)
> > +	ldp	data1, data2, [srcin]
> > +#ifdef __AARCH64EB__
> > +	/* For big-endian, carry propagation (if the final byte in the
> > +	   string is 0x01) means we cannot use has_nul1/2 directly.
> > +	   Since we expect strings to be small and early-exit,
> > +	   byte-swap the data now so has_nul1/2 will be correct.  */
> > +	rev	data1, data1
> > +	rev	data2, data2
> > +#endif
> >  	sub	tmp1, data1, zeroones
> > -	orr	tmp2, data1, #REP8_7f
> >  	sub	tmp3, data2, zeroones
> > -	orr	tmp4, data2, #REP8_7f
> > -	bic	has_nul1, tmp1, tmp2
> > -	bics	has_nul2, tmp3, tmp4
> > -	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
> > -	b.eq	L(loop)
> > -	/* End of critical section -- keep to one 64Byte cache line.  */
> > +	orr	tmp4, data2, REP8_7f
> > +	bics	has_nul1, tmp1, tmp2
> > +	bic	has_nul2, tmp3, tmp4
> > +	ccmp	has_nul2, 0, 0, eq
> > +	beq	L(main_loop_entry)
> >
> > -	sub	len, src, srcin
> > -	cbz	has_nul1, L(nul_in_data2)
> > -#ifdef __AARCH64EB__
> > -	mov	data2, data1
> > -#endif
> > -	sub	len, len, #8
> > -	mov	has_nul2, has_nul1
> > -L(nul_in_data2):
> > +	/* Enter with C = has_nul1 == 0.  */
> > +	csel	has_nul1, has_nul1, has_nul2, cc
> > +	mov	len, 8
> > +	rev	has_nul1, has_nul1
> > +	clz	tmp1, has_nul1
> > +	csel	len, xzr, len, cc
> > +	add	len, len, tmp1, lsr 3
> > +	ret
> > +
> > +	/* The inner loop processes 32 bytes per iteration and uses the fast
> > +	   NUL check.  If we encounter non-ASCII characters, use a second
> > +	   loop with the accurate NUL check.  */
> > +	.p2align 4
> > +L(main_loop_entry):
> > +	bic	src, srcin, 15
> > +	sub	src, src, 16
> > +L(main_loop):
> > +	ldp	data1, data2, [src, 32]!
> > +.Lpage_cross_entry:
> > +	sub	tmp1, data1, zeroones
> > +	sub	tmp3, data2, zeroones
> > +	orr	tmp2, tmp1, tmp3
> > +	tst	tmp2, zeroones, lsl 7
> > +	bne	1f
> > +	ldp	data1, data2, [src, 16]
> > +	sub	tmp1, data1, zeroones
> > +	sub	tmp3, data2, zeroones
> > +	orr	tmp2, tmp1, tmp3
> > +	tst	tmp2, zeroones, lsl 7
> > +	beq	L(main_loop)
> > +	add	src, src, 16
> > +1:
> > +	/* The fast check failed, so do the slower, accurate NUL check.  */
> > +	orr	tmp2, data1, REP8_7f
> > +	orr	tmp4, data2, REP8_7f
> > +	bics	has_nul1, tmp1, tmp2
> > +	bic	has_nul2, tmp3, tmp4
> > +	ccmp	has_nul2, 0, 0, eq
> > +	beq	L(nonascii_loop)
> > +
> > +	/* Enter with C = has_nul1 == 0.  */
> > +L(tail):
> >  #ifdef __AARCH64EB__
> >  	/* For big-endian, carry propagation (if the final byte in the
> > -	   string is 0x01) means we cannot use has_nul directly.  The
> > +	   string is 0x01) means we cannot use has_nul1/2 directly.  The
> >  	   easiest way to get the correct byte is to byte-swap the data
> >  	   and calculate the syndrome a second time.  */
> > -	rev	data2, data2
> > -	sub	tmp1, data2, zeroones
> > -	orr	tmp2, data2, #REP8_7f
> > -	bic	has_nul2, tmp1, tmp2
> > +	csel	data1, data1, data2, cc
> > +	rev	data1, data1
> > +	sub	tmp1, data1, zeroones
> > +	orr	tmp2, data1, REP8_7f
> > +	bic	has_nul1, tmp1, tmp2
> > +#else
> > +	csel	has_nul1, has_nul1, has_nul2, cc
> >  #endif
> > -	sub	len, len, #8
> > -	rev	has_nul2, has_nul2
> > -	clz	pos, has_nul2
> > -	add	len, len, pos, lsr #3	/* Bits to bytes.  */
> > -	RET
> > -
> > -L(misaligned):
> > -	cmp	tmp1, #8
> > -	neg	tmp1, tmp1
> > -	ldp	data1, data2, [src], #16
> > -	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
> > -	mov	tmp2, #~0
> > +	sub	len, src, srcin
> > +	rev	has_nul1, has_nul1
> > +	add	tmp2, len, 8
> > +	clz	tmp1, has_nul1
> > +	csel	len, len, tmp2, cc
> > +	add	len, len, tmp1, lsr 3
> > +	ret
> > +
> > +L(nonascii_loop):
> > +	ldp	data1, data2, [src, 16]!
> > +	sub	tmp1, data1, zeroones
> > +	orr	tmp2, data1, REP8_7f
> > +	sub	tmp3, data2, zeroones
> > +	orr	tmp4, data2, REP8_7f
> > +	bics	has_nul1, tmp1, tmp2
> > +	bic	has_nul2, tmp3, tmp4
> > +	ccmp	has_nul2, 0, 0, eq
> > +	bne	L(tail)
> > +	ldp	data1, data2, [src, 16]!
> > +	sub	tmp1, data1, zeroones
> > +	orr	tmp2, data1, REP8_7f
> > +	sub	tmp3, data2, zeroones
> > +	orr	tmp4, data2, REP8_7f
> > +	bics	has_nul1, tmp1, tmp2
> > +	bic	has_nul2, tmp3, tmp4
> > +	ccmp	has_nul2, 0, 0, eq
> > +	beq	L(nonascii_loop)
> > +	b	L(tail)
> > +
> > +	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> > +	   srcin to 0x7f, so we ignore any NUL bytes before the string.
> > +	   Then continue in the aligned loop.  */
> > +L(page_cross):
> > +	bic	src, srcin, 15
> > +	ldp	data1, data2, [src]
> > +	lsl	tmp1, srcin, 3
> > +	mov	tmp4, -1
> >  #ifdef __AARCH64EB__
> > -	/* Big-endian.  Early bytes are at MSB.  */
> > -	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> > +	/* Big-endian.  Early bytes are at MSB.  */
> > +	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
> >  #else
> >  	/* Little-endian.  Early bytes are at LSB.  */
> > -	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> > +	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
> >  #endif
> > -	orr	data1, data1, tmp2
> > -	orr	data2a, data2, tmp2
> > -	csinv	data1, data1, xzr, le
> > -	csel	data2, data2, data2a, le
> > -	b	L(realigned)
> > +	orr	tmp1, tmp1, REP8_80
> > +	orn	data1, data1, tmp1
> > +	orn	tmp2, data2, tmp1
> > +	tst	srcin, 8
> > +	csel	data1, data1, tmp4, eq
> > +	csel	data2, data2, tmp2, eq
> > +	b	L(page_cross_entry)
> >  END (strlen)
> >  libc_hidden_builtin_def (strlen)
> > --
> > 1.9.1
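
A note for reviewers: the two NUL checks described in the patch comments can be sketched in plain C.  This is only an illustration of the bit tricks, not code from the patch; the helper names has_zero_byte and maybe_zero_byte are invented for this example.

#include <stdint.h>
#include <stdio.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL
#define REP8_80 0x8080808080808080ULL

/* Accurate check: (X - 1) & ~(X | 0x7f..7f) is non-zero iff some byte
   of X is zero.  This mirrors the sub/orr/bics sequence in the patch.  */
static int
has_zero_byte (uint64_t x)
{
  return ((x - REP8_01) & ~(x | REP8_7f)) != 0;
}

/* Fast check: (X - 1) & 0x80..80 is zero for non-NUL ASCII bytes, but
   gives false hits for bytes 0x81..0xff, so a hit must be confirmed
   with the accurate check above.  */
static int
maybe_zero_byte (uint64_t x)
{
  return ((x - REP8_01) & REP8_80) != 0;
}

int
main (void)
{
  uint64_t ascii = 0x4142434445464748ULL;	/* eight ASCII letters, no NUL */
  uint64_t with_nul = 0x4142434400464748ULL;	/* one zero byte */
  uint64_t high = 0x41424344ff464748ULL;	/* 0xff byte, no NUL */

  printf ("%d %d\n", has_zero_byte (ascii), maybe_zero_byte (ascii));		/* 0 0 */
  printf ("%d %d\n", has_zero_byte (with_nul), maybe_zero_byte (with_nul));	/* 1 1 */
  printf ("%d %d\n", has_zero_byte (high), maybe_zero_byte (high));		/* 0 1: false hit */
  return 0;
}

The fast check is what the main loop performs on two words at once (tst tmp2, zeroones, lsl 7 tests against the 0x80..80 mask); any hit there is confirmed with the accurate check before the length is computed.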
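Similarly, the page cross test at the entry point (the and/cmp/b.gt on srcin) amounts to the condition below; again just a hedged C sketch, with crosses_page being an invented name:

#include <stdint.h>

#define MIN_PAGE_SIZE 4096	/* minimum page size assumed by the patch */

/* A 16-byte unaligned load starting at addr stays within one page iff
   the offset of addr inside the page is at most MIN_PAGE_SIZE - 16;
   otherwise strlen takes the L(page_cross) path.  */
static int
crosses_page (uintptr_t addr)
{
  return (addr & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}

Only offsets above MIN_PAGE_SIZE - 16 fail the check, which is where the ~0.4% figure in the comment comes from; as the patch comment notes, building the assembly with -DTEST_PAGE_CROSS shrinks MIN_PAGE_SIZE so that every call exercises this path.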