Re: [PATCH] aarch64: Optimize string functions with shrn instruction

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

From: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
To: Danila Kutenin <danilak@google.com>
Cc: libc-alpha@sourceware.org, Danila Kutenin <kutdanila@yandex.ru>
Subject: Re: [PATCH] aarch64: Optimize string functions with shrn instruction
Date: Tue, 21 Jun 2022 10:07:46 +0100	[thread overview]
Message-ID: <YrGKYrHR/rsh0dNo@arm.com> (raw)
In-Reply-To: <20220620174628.2820531-1-danilak@google.com>

The 06/20/2022 17:46, Danila Kutenin wrote:
> From: Danila Kutenin <kutdanila@yandex.ru>
> 
> We found that string functions were using AND+ADDP
> to find the nibble/syndrome mask, but there is a simpler
> option: `SHRN dst, src, 4`, which has the same latency
> as ADDP on all SIMD ARMv8 targets. There are also
> gaps in memcmp, but that's probably for another patch.
> 
> We see 10-20% savings for small-mid size cases which are
> primary cases for general workloads https://pastebin.com/hA5Fd8eM
> 
> I don't have commit rights, asking maintainers to do that
> 
> Signed-off-by: Danila Kutenin <danilak@google.com>

Is this a contribution from Google, from Yandex, or a personal one?

(e.g. if your company has a copyright assignment with the FSF then
you don't need a Signed-off-by; otherwise it's better to have
the email address consistent with the author address)

> ---
>  sysdeps/aarch64/memchr.S    | 19 +++++++------------
>  sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
>  sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
>  sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
>  sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
>  sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
>  6 files changed, 57 insertions(+), 98 deletions(-)
> 
> diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> index b060eee97d..b983489491 100644
> --- a/sysdeps/aarch64/memchr.S
> +++ b/sysdeps/aarch64/memchr.S
> @@ -53,12 +53,11 @@
>  
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (MEMCHR)
>  	PTR_ARG (0)
> @@ -67,12 +66,9 @@ ENTRY (MEMCHR)
>  	cbz	cntin, L(nomatch)
>  	ld1	{vdata.16b}, [src]
>  	dup	vrepchr.16b, chrin
> -	mov	wtmp, 0xf00f
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
>  	lsl	shift, srcin, 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbz	synd, L(start_loop)
> @@ -111,8 +107,7 @@ L(loop32_2):
>  	fmov	synd, dend
>  	cbz	synd, L(loop32)
>  L(end):
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	add	tmp, srcin, cntin
>  	sub	cntrem, tmp, src
> diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> index e0efbad91c..5179320720 100644
> --- a/sysdeps/aarch64/memrchr.S
> +++ b/sysdeps/aarch64/memrchr.S
> @@ -37,7 +37,6 @@
>  #define synd		x5
>  #define shift		x6
>  #define	tmp		x7
> -#define wtmp		w7
>  #define end		x8
>  #define endm1		x9
>  
> @@ -45,18 +44,16 @@
>  #define qdata		q1
>  #define vdata		v1
>  #define vhas_chr	v2
> -#define vrepmask	v3
> -#define vend		v4
> -#define dend		d4
> +#define vend		v3
> +#define dend		d3
>  
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (__memrchr)
>  	PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (__memrchr)
>  	cbz	cntin, L(nomatch)
>  	ld1	{vdata.16b}, [src]
>  	dup	vrepchr.16b, chrin
> -	mov	wtmp, 0xf00f
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
>  	neg	shift, end, lsl 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsl	synd, synd, shift
>  	cbz	synd, L(start_loop)
> @@ -109,8 +103,7 @@ L(loop32_2):
>  	fmov	synd, dend
>  	cbz	synd, L(loop32)
>  L(end):
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  
>  	add	tmp, src, 15
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 442726fd49..ee154ab74b 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -33,38 +33,32 @@
>  #define src		x2
>  #define tmp1		x1
>  #define tmp2		x3
> -#define tmp2w		w3
>  
>  #define vrepchr		v0
>  #define vdata		v1
>  #define qdata		q1
>  #define vhas_nul	v2
>  #define vhas_chr	v3
> -#define vrepmask	v4
> -#define vend		v5
> -#define dend		d5
> +#define vend		v4
> +#define dend		d4
>  
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (__strchrnul)
>  	PTR_ARG (0)
>  	bic	src, srcin, 15
>  	dup	vrepchr.16b, chrin
>  	ld1	{vdata.16b}, [src]
> -	mov	tmp2w, 0xf00f
> -	dup	vrepmask.8h, tmp2w
>  	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
>  	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
>  	lsl	tmp2, srcin, 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	tmp1, dend
>  	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
>  	cbz	tmp1, L(loop)
> @@ -83,8 +77,7 @@ L(loop):
>  	fmov	tmp1, dend
>  	cbz	tmp1, L(loop)
>  
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	tmp1, dend
>  #ifndef __AARCH64EB__
>  	rbit	tmp1, tmp1
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index da53170ece..78d27b4aa6 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -40,7 +40,6 @@
>  #define len		x4
>  #define synd		x4
>  #define	tmp		x5
> -#define wtmp		w5
>  #define shift		x5
>  #define data1		x6
>  #define dataw1		w6
> @@ -50,9 +49,8 @@
>  #define dataq		q0
>  #define vdata		v0
>  #define vhas_nul	v1
> -#define vrepmask	v2
> -#define vend		v3
> -#define dend		d3
> +#define vend		v2
> +#define dend		d2
>  #define dataq2		q1
>  
>  #ifdef BUILD_STPCPY
> @@ -63,34 +61,29 @@
>  # define IFSTPCPY(X,...)
>  #endif
>  
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (STRCPY)
>  	PTR_ARG (0)
>  	PTR_ARG (1)
>  	bic	src, srcin, 15
> -	mov	wtmp, 0xf00f
>  	ld1	{vdata.16b}, [src]
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_nul.16b, vdata.16b, 0
>  	lsl	shift, srcin, 2
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbnz	synd, L(tail)
>  
>  	ldr	dataq, [src, 16]!
>  	cmeq	vhas_nul.16b, vdata.16b, 0
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	cbz	synd, L(start_loop)
>  
> @@ -162,8 +155,7 @@ L(loop):
>  	fmov	synd, dend
>  	cbz	synd, L(loop)
>  
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  #ifndef __AARCH64EB__
>  	rbit	synd, synd
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index a2310871c2..3a5d088407 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -34,35 +34,29 @@
>  #define src		x1
>  #define	synd		x2
>  #define tmp		x3
> -#define wtmp		w3
>  #define shift		x4
>  
>  #define data		q0
>  #define vdata		v0
>  #define vhas_nul	v1
> -#define vrepmask	v2
> -#define vend		v3
> -#define dend		d3
> +#define vend		v2
> +#define dend		d2
>  
>  /* Core algorithm:
>  
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (STRLEN)
>  	PTR_ARG (0)
>  	bic	src, srcin, 15
> -	mov	wtmp, 0xf00f
>  	ld1	{vdata.16b}, [src]
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_nul.16b, vdata.16b, 0
>  	lsl	shift, srcin, 2
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbz	synd, L(loop)
> @@ -80,8 +74,7 @@ L(loop):
>  	fmov	synd, dend
>  	cbz	synd, L(loop)
>  
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	sub	result, src, srcin
>  	fmov	synd, dend
>  #ifndef __AARCH64EB__
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 0dbecb0ce9..282bddc9aa 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -33,39 +33,33 @@
>  #define src		x2
>  #define synd		x3
>  #define	shift		x4
> -#define wtmp		w4
>  #define tmp		x4
>  #define cntrem		x5
>  
>  #define qdata		q0
>  #define vdata		v0
>  #define vhas_chr	v1
> -#define vrepmask	v2
> -#define vend		v3
> -#define dend		d3
> +#define vend		v2
> +#define dend		d2
>  
>  /*
>     Core algorithm:
>  
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (__strnlen)
>  	PTR_ARG (0)
>  	SIZE_ARG (1)
>  	bic	src, srcin, 15
> -	mov	wtmp, 0xf00f
>  	cbz	cntin, L(nomatch)
>  	ld1	{vdata.16b}, [src], 16
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_chr.16b, vdata.16b, 0
>  	lsl	shift, srcin, 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbz	synd, L(start_loop)
> @@ -103,8 +97,7 @@ L(loop32_2):
>  	cbz	synd, L(loop32)
>  
>  L(end):
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	sub	src, src, 16
>  	mov	synd, vend.d[0]
>  	sub	result, src, srcin
> -- 
> 2.37.0.rc0.104.g0611611a94-goog
>

next prev parent reply	other threads:[~2022-06-21  9:08 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-20 17:46 Danila Kutenin
2022-06-21  9:07 ` Szabolcs Nagy [this message]
2022-06-21  9:28   ` Danila Kutenin
2022-06-22  6:48     ` Szabolcs Nagy
2022-06-22  7:31       ` Danila Kutenin
2022-06-22  8:40         ` Szabolcs Nagy
2022-06-22  7:29 Danila Kutenin
2022-06-27 16:02 ` Carlos O'Donell
2022-06-27 16:12   ` Danila Kutenin
2022-06-27 16:12 Danila Kutenin
2022-07-06  7:42 ` Danila Kutenin
2022-07-06  8:30   ` Szabolcs Nagy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YrGKYrHR/rsh0dNo@arm.com \
    --to=szabolcs.nagy@arm.com \
    --cc=danilak@google.com \
    --cc=kutdanila@yandex.ru \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).