public inbox for libc-alpha@sourceware.org
* [PATCH] AArch64: Optimize strlen
@ 2023-01-12 15:53 Wilco Dijkstra
  2023-01-13 12:26 ` Szabolcs Nagy
  0 siblings, 1 reply; 9+ messages in thread
From: Wilco Dijkstra @ 2023-01-12 15:53 UTC (permalink / raw)
  To: 'GNU C Library'; +Cc: Szabolcs Nagy

Optimize strlen by unrolling the main loop.  Large strings are 64% faster on
modern CPUs.  Passes regression tests.

---

diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index b3c92d9dc9b3c52e29e05ebbb89b929f177dc2cf..133ef933425fa260e61642a7840d73391168507d 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -43,12 +43,9 @@
 #define dend		d2
 
 /* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
-   per byte. We take 4 bits of every comparison byte with shift right and narrow
-   by 4 instruction. Since the bits in the nibble mask reflect the order in
-   which things occur in the original string, counting trailing zeros identifies
-   exactly which byte matched.  */
+   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+   four bits per byte using the shrn instruction. A count trailing zeros then
+   identifies the first zero byte.  */
 
 ENTRY (STRLEN)
 	PTR_ARG (0)
@@ -68,18 +65,25 @@ ENTRY (STRLEN)
 
 	.p2align 5
 L(loop):
-	ldr	data, [src, 16]!
+	ldr	data, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loop_end)
+	ldr	data, [src, 32]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop)
-
+	sub	src, src, 16
+L(loop_end):
 	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	sub	result, src, srcin
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
 #endif
+	add	result, result, 16
 	clz	tmp, synd
 	add	result, result, tmp, lsr 2
 	ret
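
For illustration, a rough C model of the shrn nibble-mask technique used in
this patch (NEON intrinsics, little-endian only; the function name is made up
and this is a sketch, not the glibc code):

#include <arm_neon.h>

/* Return the index of the first NUL byte in the 16 bytes at P,
   or -1 if there is none.  Assumes P points at 16 readable bytes.  */
static inline int
first_nul_in_16 (const unsigned char *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t cmp = vceqzq_u8 (data);	/* 0xff where a byte is 0.  */
  /* Shift each 16-bit lane right by 4 and narrow to 8 bits: this packs
     the 128-bit comparison result into a 64-bit mask with 4 bits per
     input byte, preserving byte order.  */
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  uint64_t syndrome = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
  if (syndrome == 0)
    return -1;				/* No NUL in this chunk.  */
  return __builtin_ctzll (syndrome) >> 2;	/* 4 bits per byte.  */
}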


^ permalink raw reply	[flat|nested] 9+ messages in thread
* RE: [PATCH][AArch64] Optimize strlen
@ 2015-04-15 12:34 Wilco Dijkstra
  0 siblings, 0 replies; 9+ messages in thread
From: Wilco Dijkstra @ 2015-04-15 12:34 UTC (permalink / raw)
  To: libc-alpha

ping
 
> > -----Original Message-----
> > From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> > Sent: 16 January 2015 17:39
> > To: libc-alpha@sourceware.org
> > Subject: [PATCH][AArch64] Optimize strlen



^ permalink raw reply	[flat|nested] 9+ messages in thread
* RE: [PATCH][AArch64] Optimize strlen
@ 2015-02-27 15:06 Wilco Dijkstra
  0 siblings, 0 replies; 9+ messages in thread
From: Wilco Dijkstra @ 2015-02-27 15:06 UTC (permalink / raw)
  To: libc-alpha

ping

> -----Original Message-----
> From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> Sent: 16 January 2015 17:39
> To: libc-alpha@sourceware.org
> Subject: [PATCH][AArch64] Optimize strlen
> 




^ permalink raw reply	[flat|nested] 9+ messages in thread
* [PATCH][AArch64] Optimize strlen
@ 2015-01-16 17:39 Wilco Dijkstra
  2015-07-06 14:42 ` Marcus Shawcroft
  0 siblings, 1 reply; 9+ messages in thread
From: Wilco Dijkstra @ 2015-01-16 17:39 UTC (permalink / raw)
  To: libc-alpha

Optimize the strlen implementation by using a page cross check and a fast check
for NUL bytes that falls back to a separate loop when a non-ASCII character is
encountered.  Speedup on test-strlen is ~10%, long ASCII strings are processed
~60% faster, and on random tests it is ~80% faster.

ChangeLog:

2015-01-16  Wilco Dijkstra  <wdijkstr@arm.com>

	* sysdeps/aarch64/strlen.S (strlen): Optimize strlen.

---
 sysdeps/aarch64/strlen.S | 230 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 165 insertions(+), 65 deletions(-)

diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index 54271b9..7ca47d9 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -20,9 +20,13 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
  */
 
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.  */
+
 /* Arguments and results.  */
 #define srcin		x0
 #define len		x0
@@ -31,87 +35,183 @@
 #define src		x1
 #define data1		x2
 #define data2		x3
-#define data2a		x4
-#define has_nul1	x5
-#define has_nul2	x6
-#define tmp1		x7
-#define tmp2		x8
-#define tmp3		x9
-#define tmp4		x10
-#define zeroones	x11
-#define pos		x12
+#define has_nul1	x4
+#define has_nul2	x5
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word. A faster check
+	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+	   false hits for characters 129..255.	*/
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
 
-	/* Start of critial section -- keep to one 64Byte cache line.  */
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+	/* Since strings are short on average, we check the first 16 bytes
+	   of the string for a NUL character.  In order to do an unaligned ldp
+	   safely we have to do a page cross check first.  If there is a NUL
+	   byte we calculate the length from the 2 8-byte words using
+	   conditional select to reduce branch mispredictions (it is unlikely
+	   strlen will be repeatedly called on strings with the same length).
+
+	   If the string is longer than 16 bytes, we align src so we don't need
+	   further page cross checks, and process 32 bytes per iteration
+	   using the fast NUL check.  If we encounter non-ASCII characters,
+	   fall back to a second loop using the full NUL check.
+
+	   If the page cross check fails, we read 16 bytes from an aligned
+	   address, remove any characters before the string, and continue
+	   in the main loop using aligned loads.  Since strings crossing a
+	   page in the first 16 bytes are rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+	   AArch64 systems have a minimum page size of 4k.  We don't bother
+	   checking for larger page sizes - the cost of setting up the correct
+	   page size is just not worth the extra gain from a small reduction in
+	   the cases taking the slow path.  Note that we only care about
+	   whether the first fetch, which may be misaligned, crosses a page
+	   boundary.  */
+
 ENTRY_ALIGN (strlen, 6)
-	mov	zeroones, #REP8_01
-	bic	src, srcin, #15
-	ands	tmp1, srcin, #15
-	b.ne	L(misaligned)
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  */
-	/* The inner loop deals with two Dwords at a time.  This has a
-	   slightly higher start-up cost, but we should win quite quickly,
-	   especially on cores with a high number of issue slots per
-	   cycle, as we get much better parallelism out of the operations.  */
-L(loop):
-	ldp	data1, data2, [src], #16
-L(realigned):
+	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
+	cmp	tmp1, MIN_PAGE_SIZE - 16
+	b.gt	L(page_cross)
+	ldp	data1, data2, [srcin]
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.
+	   Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_nul1/2 will be correct.  */
+	rev	data1, data1
+	rev	data2, data2
+#endif
 	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
+	orr	tmp2, data1, REP8_7f
 	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	L(loop)
-	/* End of critical section -- keep to one 64Byte cache line.  */
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
 
-	sub	len, src, srcin
-	cbz	has_nul1, L(nul_in_data2)
-#ifdef __AARCH64EB__
-	mov	data2, data1
-#endif
-	sub	len, len, #8
-	mov	has_nul2, has_nul1
-L(nul_in_data2):
+	/* Enter with C = has_nul1 == 0.  */
+	csel	has_nul1, has_nul1, has_nul2, cc
+	mov	len, 8
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
+	add	len, len, tmp1, lsr 3
+	ret
+
+	/* The inner loop processes 32 bytes per iteration and uses the fast
+	   NUL check.  If we encounter non-ASCII characters, use a second
+	   loop with the accurate NUL check.  */
+	.p2align 4
+L(main_loop_entry):
+	bic	src, srcin, 15
+	sub	src, src, 16
+L(main_loop):
+	ldp	data1, data2, [src, 32]!
+.Lpage_cross_entry:
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	bne	1f
+	ldp	data1, data2, [src, 16]
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	beq	L(main_loop)
+	add	src, src, 16
+1:
+	/* The fast check failed, so do the slower, accurate NUL check.	 */
+	orr	tmp2, data1, REP8_7f
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+
+	/* Enter with C = has_nul1 == 0.  */
+L(tail):
 #ifdef __AARCH64EB__
 	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul directly.  The
+	   string is 0x01) means we cannot use has_nul1/2 directly.  The
 	   easiest way to get the correct byte is to byte-swap the data
 	   and calculate the syndrome a second time.  */
-	rev	data2, data2
-	sub	tmp1, data2, zeroones
-	orr	tmp2, data2, #REP8_7f
-	bic	has_nul2, tmp1, tmp2
+	csel	data1, data1, data2, cc
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#else
+	csel	has_nul1, has_nul1, has_nul2, cc
 #endif
-	sub	len, len, #8
-	rev	has_nul2, has_nul2
-	clz	pos, has_nul2
-	add	len, len, pos, lsr #3		/* Bits to bytes.  */
-	RET
-
-L(misaligned):
-	cmp	tmp1, #8
-	neg	tmp1, tmp1
-	ldp	data1, data2, [src], #16
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	mov	tmp2, #~0
+	sub	len, src, srcin
+	rev	has_nul1, has_nul1
+	add	tmp2, len, 8
+	clz	tmp1, has_nul1
+	csel	len, len, tmp2, cc
+	add	len, len, tmp1, lsr 3
+	ret
+
+L(nonascii_loop):
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	bne	L(tail)
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+	b	L(tail)
+
+	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+	   srcin to 0x7f, so we ignore any NUL bytes before the string.
+	   Then continue in the aligned loop.  */
+L(page_cross):
+	bic	src, srcin, 15
+	ldp	data1, data2, [src]
+	lsl	tmp1, srcin, 3
+	mov	tmp4, -1
 #ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+	/* Big-endian.	Early bytes are at MSB.	 */
+	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
 #else
 	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
 #endif
-	orr	data1, data1, tmp2
-	orr	data2a, data2, tmp2
-	csinv	data1, data1, xzr, le
-	csel	data2, data2, data2a, le
-	b	L(realigned)
+	orr	tmp1, tmp1, REP8_80
+	orn	data1, data1, tmp1
+	orn	tmp2, data2, tmp1
+	tst	srcin, 8
+	csel	data1, data1, tmp4, eq
+	csel	data2, data2, tmp2, eq
+	b	L(page_cross_entry)
 END (strlen)
 libc_hidden_builtin_def (strlen)
-- 
1.9.1
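
For reference, a plain C sketch of the word-at-a-time checks and the page
cross test described in the comments of this patch (constants copied from the
patch; the helper names are illustrative, not part of glibc):

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL
#define REP8_80 0x8080808080808080ULL
#define MIN_PAGE_SIZE 4096

/* Accurate check: non-zero iff the 8-byte word X contains a NUL byte.  */
static inline uint64_t
has_nul (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

/* Fast check used in the main loop: zero for words of non-NUL ASCII
   bytes, but gives false hits for bytes 0x81..0xff, which is why the
   code falls back to the accurate check and L(nonascii_loop).  */
static inline uint64_t
maybe_nul (uint64_t x)
{
  return (x - REP8_01) & REP8_80;
}

/* Entry test: true when an unaligned 16-byte load starting at SRCIN
   could cross a minimum-size (4k) page boundary, i.e. fewer than 16
   bytes remain before the next page.  */
static inline int
page_cross (uintptr_t srcin)
{
  return (srcin & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}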




^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-03-02 14:29 UTC | newest]

Thread overview: 9+ messages
-- links below jump to the message on this page --
2023-01-12 15:53 [PATCH] AArch64: Optimize strlen Wilco Dijkstra
2023-01-13 12:26 ` Szabolcs Nagy
2023-03-02 11:49   ` Cupertino Miranda
2023-03-02 13:07     ` Wilco Dijkstra
2023-03-02 14:29       ` Cupertino Miranda
  -- strict thread matches above, loose matches on Subject: below --
2015-04-15 12:34 [PATCH][AArch64] " Wilco Dijkstra
2015-02-27 15:06 Wilco Dijkstra
2015-01-16 17:39 Wilco Dijkstra
2015-07-06 14:42 ` Marcus Shawcroft
