From: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Cc: 'GNU C Library' <libc-alpha@sourceware.org>
Subject: Re: [PATCH] AArch64: Improve strnlen performance
Date: Wed, 30 Jun 2021 09:41:46 +0100 [thread overview]
Message-ID: <20210630084145.GD14854@arm.com> (raw)
In-Reply-To: <VE1PR08MB5599E4DA23BB432BDE1ABDEB83089@VE1PR08MB5599.eurprd08.prod.outlook.com>
The 06/23/2021 16:22, Wilco Dijkstra wrote:
>
> Optimize strnlen by avoiding UMINV which is slow on most cores. On Neoverse N1
> large strings are 1.8x faster than the current version, and bench-strnlen is 50%
> faster overall. This version is MTE compatible.
>
> Passes GLIBC regress, OK for commit?
This is ok to commit, thanks.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 2b57575c55cc41a5c6aa813af216c6e34f6cb7b0..37e9eed4120750f4e03d563938438b8c5384f75d 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -22,197 +22,105 @@
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64
> + * ARMv8-a, AArch64, Advanced SIMD.
> + * MTE compatible.
> */
>
> -/* Arguments and results. */
> #define srcin x0
> -#define len x0
> -#define limit x1
> +#define cntin x1
> +#define result x0
>
> -/* Locals and temporaries. */
> #define src x2
> -#define data1 x3
> -#define data2 x4
> -#define data2a x5
> -#define has_nul1 x6
> -#define has_nul2 x7
> -#define tmp1 x8
> -#define tmp2 x9
> -#define tmp3 x10
> -#define tmp4 x11
> -#define zeroones x12
> -#define pos x13
> -#define limit_wd x14
> -
> -#define dataq q2
> -#define datav v2
> -#define datab2 b3
> -#define dataq2 q3
> -#define datav2 v3
> -#define REP8_01 0x0101010101010101
> -#define REP8_7f 0x7f7f7f7f7f7f7f7f
> -#define REP8_80 0x8080808080808080
> -
> -ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
> +#define synd x3
> +#define shift x4
> +#define wtmp w4
> +#define tmp x4
> +#define cntrem x5
> +
> +#define qdata q0
> +#define vdata v0
> +#define vhas_chr v1
> +#define vrepmask v2
> +#define vend v3
> +#define dend d3
> +
> +/*
> + Core algorithm:
> +
> + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> + set likewise for odd bytes so that adjacent bytes can be merged. Since the
> + bits in the syndrome reflect the order in which things occur in the original
> + string, counting trailing zeros identifies exactly which byte matched. */
> +
> +ENTRY (__strnlen)
> PTR_ARG (0)
> SIZE_ARG (1)
> - cbz limit, L(hit_limit)
> - mov zeroones, #REP8_01
> - bic src, srcin, #15
> - ands tmp1, srcin, #15
> - b.ne L(misaligned)
> - /* Calculate the number of full and partial words -1. */
> - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
> - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
> -
> - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> - can be done in parallel across the entire word. */
> - /* The inner loop deals with two Dwords at a time. This has a
> - slightly higher start-up cost, but we should win quite quickly,
> - especially on cores with a high number of issue slots per
> - cycle, as we get much better parallelism out of the operations. */
> -
> - /* Start of critial section -- keep to one 64Byte cache line. */
> -
> - ldp data1, data2, [src], #16
> -L(realigned):
> - sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> - sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> - bic has_nul1, tmp1, tmp2
> - bic has_nul2, tmp3, tmp4
> - subs limit_wd, limit_wd, #1
> - orr tmp1, has_nul1, has_nul2
> - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
> - b.eq L(loop)
> - /* End of critical section -- keep to one 64Byte cache line. */
> -
> - orr tmp1, has_nul1, has_nul2
> - cbz tmp1, L(hit_limit) /* No null in final Qword. */
> -
> - /* We know there's a null in the final Qword. The easiest thing
> - to do now is work out the length of the string and return
> - MIN (len, limit). */
> -
> - sub len, src, srcin
> - cbz has_nul1, L(nul_in_data2)
> -#ifdef __AARCH64EB__
> - mov data2, data1
> + bic src, srcin, 15
> + mov wtmp, 0xf00f
> + cbz cntin, L(nomatch)
> + ld1 {vdata.16b}, [src], 16
> + dup vrepmask.8h, wtmp
> + cmeq vhas_chr.16b, vdata.16b, 0
> + lsl shift, srcin, 2
> + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + fmov synd, dend
> + lsr synd, synd, shift
> + cbz synd, L(start_loop)
> +L(finish):
> + rbit synd, synd
> + clz synd, synd
> + lsr result, synd, 2
> + cmp cntin, result
> + csel result, cntin, result, ls
> + ret
> +
> +L(start_loop):
> + sub tmp, src, srcin
> + subs cntrem, cntin, tmp
> + b.ls L(nomatch)
> +
> + /* Make sure that it won't overread by a 16-byte chunk */
> + add tmp, cntrem, 15
> + tbnz tmp, 4, L(loop32_2)
> +
> + .p2align 5
> +L(loop32):
> + ldr qdata, [src], 16
> + cmeq vhas_chr.16b, vdata.16b, 0
> + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + fmov synd, dend
> + cbnz synd, L(end)
> +L(loop32_2):
> + ldr qdata, [src], 16
> + subs cntrem, cntrem, 32
> + cmeq vhas_chr.16b, vdata.16b, 0
> + b.ls L(end)
> + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + fmov synd, dend
> + cbz synd, L(loop32)
> +
> +L(end):
> + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + sub src, src, 16
> + mov synd, vend.d[0]
> + sub result, src, srcin
> +#ifndef __AARCH64EB__
> + rbit synd, synd
> #endif
> - sub len, len, #8
> - mov has_nul2, has_nul1
> -L(nul_in_data2):
> -#ifdef __AARCH64EB__
> - /* For big-endian, carry propagation (if the final byte in the
> - string is 0x01) means we cannot use has_nul directly. The
> - easiest way to get the correct byte is to byte-swap the data
> - and calculate the syndrome a second time. */
> - rev data2, data2
> - sub tmp1, data2, zeroones
> - orr tmp2, data2, #REP8_7f
> - bic has_nul2, tmp1, tmp2
> -#endif
> - sub len, len, #8
> - rev has_nul2, has_nul2
> - clz pos, has_nul2
> - add len, len, pos, lsr #3 /* Bits to bytes. */
> - cmp len, limit
> - csel len, len, limit, ls /* Return the lower value. */
> - RET
> -
> -L(loop):
> - ldr dataq, [src], #16
> - uminv datab2, datav.16b
> - mov tmp1, datav2.d[0]
> - subs limit_wd, limit_wd, #1
> - ccmp tmp1, #0, #4, pl /* NZCV = 0000 */
> - b.eq L(loop_end)
> - ldr dataq, [src], #16
> - uminv datab2, datav.16b
> - mov tmp1, datav2.d[0]
> - subs limit_wd, limit_wd, #1
> - ccmp tmp1, #0, #4, pl /* NZCV = 0000 */
> - b.ne L(loop)
> -L(loop_end):
> - /* End of critical section -- keep to one 64Byte cache line. */
> -
> - cbnz tmp1, L(hit_limit) /* No null in final Qword. */
> -
> - /* We know there's a null in the final Qword. The easiest thing
> - to do now is work out the length of the string and return
> - MIN (len, limit). */
> -
> -#ifdef __AARCH64EB__
> - rev64 datav.16b, datav.16b
> -#endif
> - /* Set te NULL byte as 0xff and the rest as 0x00, move the data into a
> - pair of scalars and then compute the length from the earliest NULL
> - byte. */
> -
> - cmeq datav.16b, datav.16b, #0
> -#ifdef __AARCH64EB__
> - mov data1, datav.d[1]
> - mov data2, datav.d[0]
> -#else
> - mov data1, datav.d[0]
> - mov data2, datav.d[1]
> -#endif
> - cmp data1, 0
> - csel data1, data1, data2, ne
> - sub len, src, srcin
> - sub len, len, #16
> - rev data1, data1
> - add tmp2, len, 8
> - clz tmp1, data1
> - csel len, len, tmp2, ne
> - add len, len, tmp1, lsr 3
> - cmp len, limit
> - csel len, len, limit, ls /* Return the lower value. */
> - RET
> -
> -L(misaligned):
> - /* Deal with a partial first word.
> - We're doing two things in parallel here;
> - 1) Calculate the number of words (but avoiding overflow if
> - limit is near ULONG_MAX) - to do this we need to work out
> - limit + tmp1 - 1 as a 65-bit value before shifting it;
> - 2) Load and mask the initial data words - we force the bytes
> - before the ones we are interested in to 0xff - this ensures
> - early bytes will not hit any zero detection. */
> - sub limit_wd, limit, #1
> - neg tmp4, tmp1
> - cmp tmp1, #8
> -
> - and tmp3, limit_wd, #15
> - lsr limit_wd, limit_wd, #4
> - mov tmp2, #~0
> -
> - ldp data1, data2, [src], #16
> - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
> - add tmp3, tmp3, tmp1
> -
> -#ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
> -#else
> - /* Little-endian. Early bytes are at LSB. */
> - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
> -#endif
> - add limit_wd, limit_wd, tmp3, lsr #4
> -
> - orr data1, data1, tmp2
> - orr data2a, data2, tmp2
> + clz synd, synd
> + add result, result, synd, lsr 2
> + cmp cntin, result
> + csel result, cntin, result, ls
> + ret
>
> - csinv data1, data1, xzr, le
> - csel data2, data2, data2a, le
> - b L(realigned)
> +L(nomatch):
> + mov result, cntin
> + ret
>
> -L(hit_limit):
> - mov len, limit
> - RET
> END (__strnlen)
> libc_hidden_def (__strnlen)
> weak_alias (__strnlen, strnlen)
>
prev parent reply other threads:[~2021-06-30 8:42 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-06-23 15:22 Wilco Dijkstra
2021-06-30 8:41 ` Szabolcs Nagy [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210630084145.GD14854@arm.com \
--to=szabolcs.nagy@arm.com \
--cc=Wilco.Dijkstra@arm.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).