public inbox for libc-alpha@sourceware.org
* [PATCH] aarch64: Optimize string functions with shrn instruction
@ 2022-06-20 17:46 Danila Kutenin
  2022-06-21  9:07 ` Szabolcs Nagy
  0 siblings, 1 reply; 12+ messages in thread
From: Danila Kutenin @ 2022-06-20 17:46 UTC (permalink / raw)
  To: libc-alpha; +Cc: Szabolcs.Nagy, Danila Kutenin, Danila Kutenin

From: Danila Kutenin <kutdanila@yandex.ru>

We found that the string functions were using AND+ADDP
to find the nibble/syndrome mask, but there is a simpler
alternative: `SHRN dst, src, 4`, which has the same latency
as ADDP on all SIMD ARMv8 targets. There are similar gaps
in memcmp, but that is probably for another patch.

We see 10-20% savings for the small and mid-size cases, which are
the primary cases for general workloads; benchmark results are at
https://pastebin.com/hA5Fd8eM
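
For illustration, here is a rough C equivalent of the per-chunk step
using NEON intrinsics. This is not part of the patch: first_match is a
made-up helper, the intrinsics stand in for the hand-written assembly,
and it assumes a little-endian target (the patch also handles big
endian via rbit).

#include <arm_neon.h>
#include <stdint.h>

/* Return the index of the first byte in chunk[0..15] equal to c,
   or 16 if no byte matches.  Little-endian sketch only.  */
static inline int
first_match (const uint8_t *chunk, uint8_t c)
{
  uint8x16_t data = vld1q_u8 (chunk);
  /* Bytewise compare: 0xff where chunk[i] == c, 0x00 elsewhere.  */
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (c));
  /* Shift each 16-bit lane right by 4 and narrow to 8 bits (SHRN).
     This keeps one nibble per input byte, i.e. the 64-bit nibble
     mask, replacing the old 0xf00f AND + ADDP pair.  */
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
  if (synd == 0)
    return 16;
  /* Byte i of the chunk owns bits 4*i..4*i+3 of the syndrome, so
     the trailing zero count divided by 4 is the byte index.  */
  return __builtin_ctzll (synd) / 4;
}

In the old code the same 64-bit syndrome took a repeated 0xf00f mask
plus an AND and an ADDP; SHRN by 4 produces it directly from the
compare result.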

I don't have commit rights, so I am asking the maintainers to commit this for me.

Signed-off-by: Danila Kutenin <danilak@google.com>
---
 sysdeps/aarch64/memchr.S    | 19 +++++++------------
 sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
 sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
 sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
 sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
 sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
 6 files changed, 57 insertions(+), 98 deletions(-)

diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index b060eee97d..b983489491 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -53,12 +53,11 @@
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (MEMCHR)
 	PTR_ARG (0)
@@ -67,12 +66,9 @@ ENTRY (MEMCHR)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -111,8 +107,7 @@ L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	add	tmp, srcin, cntin
 	sub	cntrem, tmp, src
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index e0efbad91c..5179320720 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -37,7 +37,6 @@
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 #define end		x8
 #define endm1		x9
 
@@ -45,18 +44,16 @@
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__memrchr)
 	PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (__memrchr)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	neg	shift, end, lsl 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsl	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -109,8 +103,7 @@ L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 
 	add	tmp, src, 15
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index 442726fd49..ee154ab74b 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -33,38 +33,32 @@
 #define src		x2
 #define tmp1		x1
 #define tmp2		x3
-#define tmp2w		w3
 
 #define vrepchr		v0
 #define vdata		v1
 #define qdata		q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask	v4
-#define vend		v5
-#define dend		d5
+#define vend		v4
+#define dend		d4
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strchrnul)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	tmp2w, 0xf00f
-	dup	vrepmask.8h, tmp2w
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	lsl	tmp2, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
 	cbz	tmp1, L(loop)
@@ -83,8 +77,7 @@ L(loop):
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #ifndef __AARCH64EB__
 	rbit	tmp1, tmp1
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index da53170ece..78d27b4aa6 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -40,7 +40,6 @@
 #define len		x4
 #define synd		x4
 #define	tmp		x5
-#define wtmp		w5
 #define shift		x5
 #define data1		x6
 #define dataw1		w6
@@ -50,9 +49,8 @@
 #define dataq		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 #define dataq2		q1
 
 #ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@
 # define IFSTPCPY(X,...)
 #endif
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbnz	synd, L(tail)
 
 	ldr	dataq, [src, 16]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(start_loop)
 
@@ -162,8 +155,7 @@ L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index a2310871c2..3a5d088407 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -34,35 +34,29 @@
 #define src		x1
 #define	synd		x2
 #define tmp		x3
-#define wtmp		w3
 #define shift		x4
 
 #define data		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /* Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRLEN)
 	PTR_ARG (0)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(loop)
@@ -80,8 +74,7 @@ L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	sub	result, src, srcin
 	fmov	synd, dend
 #ifndef __AARCH64EB__
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 0dbecb0ce9..282bddc9aa 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -33,39 +33,33 @@
 #define src		x2
 #define synd		x3
 #define	shift		x4
-#define wtmp		w4
 #define tmp		x4
 #define cntrem		x5
 
 #define qdata		q0
 #define vdata		v0
 #define vhas_chr	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /*
    Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strnlen)
 	PTR_ARG (0)
 	SIZE_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src], 16
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -103,8 +97,7 @@ L(loop32_2):
 	cbz	synd, L(loop32)
 
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	sub	src, src, 16
 	mov	synd, vend.d[0]
 	sub	result, src, srcin
-- 
2.37.0.rc0.104.g0611611a94-goog



* Re: [PATCH] aarch64: Optimize string functions with shrn instruction
  2022-06-20 17:46 [PATCH] aarch64: Optimize string functions with shrn instruction Danila Kutenin
@ 2022-06-21  9:07 ` Szabolcs Nagy
  2022-06-21  9:28   ` Danila Kutenin
  0 siblings, 1 reply; 12+ messages in thread
From: Szabolcs Nagy @ 2022-06-21  9:07 UTC (permalink / raw)
  To: Danila Kutenin; +Cc: libc-alpha, Danila Kutenin

The 06/20/2022 17:46, Danila Kutenin wrote:
> From: Danila Kutenin <kutdanila@yandex.ru>
> 
> We found that string functions were using AND+ADDP
> to find the nibble/syndrome mask but there is an easier
> opportunity through `SHRN dst, src, 4` and has same
> latency on all SIMD ARMv8 targets as ADDP. There are also
> gaps for memcmp but that's probably for another patch
> 
> We see 10-20% savings for small-mid size cases which are
> primary cases for general workloads https://pastebin.com/hA5Fd8eM
> 
> I don't have commit rights, asking maintainers to do that
> 
> Signed-off-by: Danila Kutenin <danilak@google.com>

is this a contribution from google or yandex or personal?

(e.g. if your company has copyright assignment with fsf then
you don't need signed-off-by, otherwise it's better to have
the email address consistent with the author address)

> ---
>  sysdeps/aarch64/memchr.S    | 19 +++++++------------
>  sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
>  sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
>  sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
>  sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
>  sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
>  6 files changed, 57 insertions(+), 98 deletions(-)
> 
> diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> index b060eee97d..b983489491 100644
> --- a/sysdeps/aarch64/memchr.S
> +++ b/sysdeps/aarch64/memchr.S
> @@ -53,12 +53,11 @@
>  
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (MEMCHR)
>  	PTR_ARG (0)
> @@ -67,12 +66,9 @@ ENTRY (MEMCHR)
>  	cbz	cntin, L(nomatch)
>  	ld1	{vdata.16b}, [src]
>  	dup	vrepchr.16b, chrin
> -	mov	wtmp, 0xf00f
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
>  	lsl	shift, srcin, 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbz	synd, L(start_loop)
> @@ -111,8 +107,7 @@ L(loop32_2):
>  	fmov	synd, dend
>  	cbz	synd, L(loop32)
>  L(end):
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	add	tmp, srcin, cntin
>  	sub	cntrem, tmp, src
> diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> index e0efbad91c..5179320720 100644
> --- a/sysdeps/aarch64/memrchr.S
> +++ b/sysdeps/aarch64/memrchr.S
> @@ -37,7 +37,6 @@
>  #define synd		x5
>  #define shift		x6
>  #define	tmp		x7
> -#define wtmp		w7
>  #define end		x8
>  #define endm1		x9
>  
> @@ -45,18 +44,16 @@
>  #define qdata		q1
>  #define vdata		v1
>  #define vhas_chr	v2
> -#define vrepmask	v3
> -#define vend		v4
> -#define dend		d4
> +#define vend		v3
> +#define dend		d3
>  
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (__memrchr)
>  	PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (__memrchr)
>  	cbz	cntin, L(nomatch)
>  	ld1	{vdata.16b}, [src]
>  	dup	vrepchr.16b, chrin
> -	mov	wtmp, 0xf00f
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
>  	neg	shift, end, lsl 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsl	synd, synd, shift
>  	cbz	synd, L(start_loop)
> @@ -109,8 +103,7 @@ L(loop32_2):
>  	fmov	synd, dend
>  	cbz	synd, L(loop32)
>  L(end):
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  
>  	add	tmp, src, 15
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 442726fd49..ee154ab74b 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -33,38 +33,32 @@
>  #define src		x2
>  #define tmp1		x1
>  #define tmp2		x3
> -#define tmp2w		w3
>  
>  #define vrepchr		v0
>  #define vdata		v1
>  #define qdata		q1
>  #define vhas_nul	v2
>  #define vhas_chr	v3
> -#define vrepmask	v4
> -#define vend		v5
> -#define dend		d5
> +#define vend		v4
> +#define dend		d4
>  
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (__strchrnul)
>  	PTR_ARG (0)
>  	bic	src, srcin, 15
>  	dup	vrepchr.16b, chrin
>  	ld1	{vdata.16b}, [src]
> -	mov	tmp2w, 0xf00f
> -	dup	vrepmask.8h, tmp2w
>  	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
>  	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
>  	lsl	tmp2, srcin, 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	tmp1, dend
>  	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
>  	cbz	tmp1, L(loop)
> @@ -83,8 +77,7 @@ L(loop):
>  	fmov	tmp1, dend
>  	cbz	tmp1, L(loop)
>  
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	tmp1, dend
>  #ifndef __AARCH64EB__
>  	rbit	tmp1, tmp1
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index da53170ece..78d27b4aa6 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -40,7 +40,6 @@
>  #define len		x4
>  #define synd		x4
>  #define	tmp		x5
> -#define wtmp		w5
>  #define shift		x5
>  #define data1		x6
>  #define dataw1		w6
> @@ -50,9 +49,8 @@
>  #define dataq		q0
>  #define vdata		v0
>  #define vhas_nul	v1
> -#define vrepmask	v2
> -#define vend		v3
> -#define dend		d3
> +#define vend		v2
> +#define dend		d2
>  #define dataq2		q1
>  
>  #ifdef BUILD_STPCPY
> @@ -63,34 +61,29 @@
>  # define IFSTPCPY(X,...)
>  #endif
>  
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (STRCPY)
>  	PTR_ARG (0)
>  	PTR_ARG (1)
>  	bic	src, srcin, 15
> -	mov	wtmp, 0xf00f
>  	ld1	{vdata.16b}, [src]
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_nul.16b, vdata.16b, 0
>  	lsl	shift, srcin, 2
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbnz	synd, L(tail)
>  
>  	ldr	dataq, [src, 16]!
>  	cmeq	vhas_nul.16b, vdata.16b, 0
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	cbz	synd, L(start_loop)
>  
> @@ -162,8 +155,7 @@ L(loop):
>  	fmov	synd, dend
>  	cbz	synd, L(loop)
>  
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  #ifndef __AARCH64EB__
>  	rbit	synd, synd
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index a2310871c2..3a5d088407 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -34,35 +34,29 @@
>  #define src		x1
>  #define	synd		x2
>  #define tmp		x3
> -#define wtmp		w3
>  #define shift		x4
>  
>  #define data		q0
>  #define vdata		v0
>  #define vhas_nul	v1
> -#define vrepmask	v2
> -#define vend		v3
> -#define dend		d3
> +#define vend		v2
> +#define dend		d2
>  
>  /* Core algorithm:
>  
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (STRLEN)
>  	PTR_ARG (0)
>  	bic	src, srcin, 15
> -	mov	wtmp, 0xf00f
>  	ld1	{vdata.16b}, [src]
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_nul.16b, vdata.16b, 0
>  	lsl	shift, srcin, 2
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbz	synd, L(loop)
> @@ -80,8 +74,7 @@ L(loop):
>  	fmov	synd, dend
>  	cbz	synd, L(loop)
>  
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	sub	result, src, srcin
>  	fmov	synd, dend
>  #ifndef __AARCH64EB__
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 0dbecb0ce9..282bddc9aa 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -33,39 +33,33 @@
>  #define src		x2
>  #define synd		x3
>  #define	shift		x4
> -#define wtmp		w4
>  #define tmp		x4
>  #define cntrem		x5
>  
>  #define qdata		q0
>  #define vdata		v0
>  #define vhas_chr	v1
> -#define vrepmask	v2
> -#define vend		v3
> -#define dend		d3
> +#define vend		v2
> +#define dend		d2
>  
>  /*
>     Core algorithm:
>  
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (__strnlen)
>  	PTR_ARG (0)
>  	SIZE_ARG (1)
>  	bic	src, srcin, 15
> -	mov	wtmp, 0xf00f
>  	cbz	cntin, L(nomatch)
>  	ld1	{vdata.16b}, [src], 16
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_chr.16b, vdata.16b, 0
>  	lsl	shift, srcin, 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbz	synd, L(start_loop)
> @@ -103,8 +97,7 @@ L(loop32_2):
>  	cbz	synd, L(loop32)
>  
>  L(end):
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	sub	src, src, 16
>  	mov	synd, vend.d[0]
>  	sub	result, src, srcin
> -- 
> 2.37.0.rc0.104.g0611611a94-goog
> 


* Re: [PATCH] aarch64: Optimize string functions with shrn instruction
  2022-06-21  9:07 ` Szabolcs Nagy
@ 2022-06-21  9:28   ` Danila Kutenin
  2022-06-22  6:48     ` Szabolcs Nagy
  0 siblings, 1 reply; 12+ messages in thread
From: Danila Kutenin @ 2022-06-21  9:28 UTC (permalink / raw)
  To: Szabolcs Nagy; +Cc: libc-alpha, Danila Kutenin

It's a contribution from Google. Google has a copyright assignment with
the FSF; I think that should cover it.

Sorry for the email confusion; I realized the mess quite late: my git
client was configured with my Yandex address while the commit was set up
with my Google account. Yandex has nothing to do with this work. If
needed, I can recreate the patch.

On Tue, Jun 21, 2022, 10:08 Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:

> The 06/20/2022 17:46, Danila Kutenin wrote:
> > From: Danila Kutenin <kutdanila@yandex.ru>
> >
> > We found that string functions were using AND+ADDP
> > to find the nibble/syndrome mask but there is an easier
> > opportunity through `SHRN dst, src, 4` and has same
> > latency on all SIMD ARMv8 targets as ADDP. There are also
> > gaps for memcmp but that's probably for another patch
> >
> > We see 10-20% savings for small-mid size cases which are
> > primary cases for general workloads https://pastebin.com/hA5Fd8eM
> >
> > I don't have commit rights, asking maintainers to do that
> >
> > Signed-off-by: Danila Kutenin <danilak@google.com>
>
> is this a contribution from google or yandex or personal?
>
> (e.g. if your company has copyright assignment with fsf then
> you dont need signed-off-by, otherwise it's better to have
> the email address consistent with the author address)
>
> > ---
> >  sysdeps/aarch64/memchr.S    | 19 +++++++------------
> >  sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
> >  sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
> >  sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
> >  sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
> >  sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
> >  6 files changed, 57 insertions(+), 98 deletions(-)
> >
> > diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> > index b060eee97d..b983489491 100644
> > --- a/sysdeps/aarch64/memchr.S
> > +++ b/sysdeps/aarch64/memchr.S
> > @@ -53,12 +53,11 @@
> >
> >  /*
> >     Core algorithm:
> > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> four bits
> > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> > -   set likewise for odd bytes so that adjacent bytes can be merged.
> Since the
> > -   bits in the syndrome reflect the order in which things occur in the
> original
> > -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> > +   per byte. We take 4 bits of every comparison byte with shift right
> and narrow
> > +   by 4 instruction. Since the bits in the nibble mask reflect the
> order in
> > +   which things occur in the original string, counting leading zeros
> identifies
> > +   exactly which byte matched.  */
> >
> >  ENTRY (MEMCHR)
> >       PTR_ARG (0)
> > @@ -67,12 +66,9 @@ ENTRY (MEMCHR)
> >       cbz     cntin, L(nomatch)
> >       ld1     {vdata.16b}, [src]
> >       dup     vrepchr.16b, chrin
> > -     mov     wtmp, 0xf00f
> > -     dup     vrepmask.8h, wtmp
> >       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
> >       lsl     shift, srcin, 2
> > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> >       fmov    synd, dend
> >       lsr     synd, synd, shift
> >       cbz     synd, L(start_loop)
> > @@ -111,8 +107,7 @@ L(loop32_2):
> >       fmov    synd, dend
> >       cbz     synd, L(loop32)
> >  L(end):
> > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> >       fmov    synd, dend
> >       add     tmp, srcin, cntin
> >       sub     cntrem, tmp, src
> > diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> > index e0efbad91c..5179320720 100644
> > --- a/sysdeps/aarch64/memrchr.S
> > +++ b/sysdeps/aarch64/memrchr.S
> > @@ -37,7 +37,6 @@
> >  #define synd         x5
> >  #define shift                x6
> >  #define      tmp             x7
> > -#define wtmp         w7
> >  #define end          x8
> >  #define endm1                x9
> >
> > @@ -45,18 +44,16 @@
> >  #define qdata                q1
> >  #define vdata                v1
> >  #define vhas_chr     v2
> > -#define vrepmask     v3
> > -#define vend         v4
> > -#define dend         d4
> > +#define vend         v3
> > +#define dend         d3
> >
> >  /*
> >     Core algorithm:
> > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> four bits
> > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> > -   set likewise for odd bytes so that adjacent bytes can be merged.
> Since the
> > -   bits in the syndrome reflect the order in which things occur in the
> original
> > -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> > +   per byte. We take 4 bits of every comparison byte with shift right
> and narrow
> > +   by 4 instruction. Since the bits in the nibble mask reflect the
> order in
> > +   which things occur in the original string, counting leading zeros
> identifies
> > +   exactly which byte matched.  */
> >
> >  ENTRY (__memrchr)
> >       PTR_ARG (0)
> > @@ -67,12 +64,9 @@ ENTRY (__memrchr)
> >       cbz     cntin, L(nomatch)
> >       ld1     {vdata.16b}, [src]
> >       dup     vrepchr.16b, chrin
> > -     mov     wtmp, 0xf00f
> > -     dup     vrepmask.8h, wtmp
> >       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
> >       neg     shift, end, lsl 2
> > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> >       fmov    synd, dend
> >       lsl     synd, synd, shift
> >       cbz     synd, L(start_loop)
> > @@ -109,8 +103,7 @@ L(loop32_2):
> >       fmov    synd, dend
> >       cbz     synd, L(loop32)
> >  L(end):
> > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> >       fmov    synd, dend
> >
> >       add     tmp, src, 15
> > diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> > index 442726fd49..ee154ab74b 100644
> > --- a/sysdeps/aarch64/strchrnul.S
> > +++ b/sysdeps/aarch64/strchrnul.S
> > @@ -33,38 +33,32 @@
> >  #define src          x2
> >  #define tmp1         x1
> >  #define tmp2         x3
> > -#define tmp2w                w3
> >
> >  #define vrepchr              v0
> >  #define vdata                v1
> >  #define qdata                q1
> >  #define vhas_nul     v2
> >  #define vhas_chr     v3
> > -#define vrepmask     v4
> > -#define vend         v5
> > -#define dend         d5
> > +#define vend         v4
> > +#define dend         d4
> >
> > -/* Core algorithm:
> > -
> > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> four bits
> > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> > -   set likewise for odd bytes so that adjacent bytes can be merged.
> Since the
> > -   bits in the syndrome reflect the order in which things occur in the
> original
> > -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> > +/*
> > +   Core algorithm:
> > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> > +   per byte. We take 4 bits of every comparison byte with shift right
> and narrow
> > +   by 4 instruction. Since the bits in the nibble mask reflect the
> order in
> > +   which things occur in the original string, counting leading zeros
> identifies
> > +   exactly which byte matched.  */
> >
> >  ENTRY (__strchrnul)
> >       PTR_ARG (0)
> >       bic     src, srcin, 15
> >       dup     vrepchr.16b, chrin
> >       ld1     {vdata.16b}, [src]
> > -     mov     tmp2w, 0xf00f
> > -     dup     vrepmask.8h, tmp2w
> >       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
> >       cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b
> >       lsl     tmp2, srcin, 2
> > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> >       fmov    tmp1, dend
> >       lsr     tmp1, tmp1, tmp2        /* Mask padding bits.  */
> >       cbz     tmp1, L(loop)
> > @@ -83,8 +77,7 @@ L(loop):
> >       fmov    tmp1, dend
> >       cbz     tmp1, L(loop)
> >
> > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> >       fmov    tmp1, dend
> >  #ifndef __AARCH64EB__
> >       rbit    tmp1, tmp1
> > diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> > index da53170ece..78d27b4aa6 100644
> > --- a/sysdeps/aarch64/strcpy.S
> > +++ b/sysdeps/aarch64/strcpy.S
> > @@ -40,7 +40,6 @@
> >  #define len          x4
> >  #define synd         x4
> >  #define      tmp             x5
> > -#define wtmp         w5
> >  #define shift                x5
> >  #define data1                x6
> >  #define dataw1               w6
> > @@ -50,9 +49,8 @@
> >  #define dataq                q0
> >  #define vdata                v0
> >  #define vhas_nul     v1
> > -#define vrepmask     v2
> > -#define vend         v3
> > -#define dend         d3
> > +#define vend         v2
> > +#define dend         d2
> >  #define dataq2               q1
> >
> >  #ifdef BUILD_STPCPY
> > @@ -63,34 +61,29 @@
> >  # define IFSTPCPY(X,...)
> >  #endif
> >
> > -/* Core algorithm:
> > -
> > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> four bits
> > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> > -   set likewise for odd bytes so that adjacent bytes can be merged.
> Since the
> > -   bits in the syndrome reflect the order in which things occur in the
> original
> > -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> > +/*
> > +   Core algorithm:
> > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> > +   per byte. We take 4 bits of every comparison byte with shift right
> and narrow
> > +   by 4 instruction. Since the bits in the nibble mask reflect the
> order in
> > +   which things occur in the original string, counting leading zeros
> identifies
> > +   exactly which byte matched.  */
> >
> >  ENTRY (STRCPY)
> >       PTR_ARG (0)
> >       PTR_ARG (1)
> >       bic     src, srcin, 15
> > -     mov     wtmp, 0xf00f
> >       ld1     {vdata.16b}, [src]
> > -     dup     vrepmask.8h, wtmp
> >       cmeq    vhas_nul.16b, vdata.16b, 0
> >       lsl     shift, srcin, 2
> > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> >       fmov    synd, dend
> >       lsr     synd, synd, shift
> >       cbnz    synd, L(tail)
> >
> >       ldr     dataq, [src, 16]!
> >       cmeq    vhas_nul.16b, vdata.16b, 0
> > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> >       fmov    synd, dend
> >       cbz     synd, L(start_loop)
> >
> > @@ -162,8 +155,7 @@ L(loop):
> >       fmov    synd, dend
> >       cbz     synd, L(loop)
> >
> > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> >       fmov    synd, dend
> >  #ifndef __AARCH64EB__
> >       rbit    synd, synd
> > diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> > index a2310871c2..3a5d088407 100644
> > --- a/sysdeps/aarch64/strlen.S
> > +++ b/sysdeps/aarch64/strlen.S
> > @@ -34,35 +34,29 @@
> >  #define src          x1
> >  #define      synd            x2
> >  #define tmp          x3
> > -#define wtmp         w3
> >  #define shift                x4
> >
> >  #define data         q0
> >  #define vdata                v0
> >  #define vhas_nul     v1
> > -#define vrepmask     v2
> > -#define vend         v3
> > -#define dend         d3
> > +#define vend         v2
> > +#define dend         d2
> >
> >  /* Core algorithm:
> >
> > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> four bits
> > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> > -   set likewise for odd bytes so that adjacent bytes can be merged.
> Since the
> > -   bits in the syndrome reflect the order in which things occur in the
> original
> > -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> > +   per byte. We take 4 bits of every comparison byte with shift right
> and narrow
> > +   by 4 instruction. Since the bits in the nibble mask reflect the
> order in
> > +   which things occur in the original string, counting trailing zeros
> identifies
> > +   exactly which byte matched.  */
> >
> >  ENTRY (STRLEN)
> >       PTR_ARG (0)
> >       bic     src, srcin, 15
> > -     mov     wtmp, 0xf00f
> >       ld1     {vdata.16b}, [src]
> > -     dup     vrepmask.8h, wtmp
> >       cmeq    vhas_nul.16b, vdata.16b, 0
> >       lsl     shift, srcin, 2
> > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> >       fmov    synd, dend
> >       lsr     synd, synd, shift
> >       cbz     synd, L(loop)
> > @@ -80,8 +74,7 @@ L(loop):
> >       fmov    synd, dend
> >       cbz     synd, L(loop)
> >
> > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> >       sub     result, src, srcin
> >       fmov    synd, dend
> >  #ifndef __AARCH64EB__
> > diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> > index 0dbecb0ce9..282bddc9aa 100644
> > --- a/sysdeps/aarch64/strnlen.S
> > +++ b/sysdeps/aarch64/strnlen.S
> > @@ -33,39 +33,33 @@
> >  #define src          x2
> >  #define synd         x3
> >  #define      shift           x4
> > -#define wtmp         w4
> >  #define tmp          x4
> >  #define cntrem               x5
> >
> >  #define qdata                q0
> >  #define vdata                v0
> >  #define vhas_chr     v1
> > -#define vrepmask     v2
> > -#define vend         v3
> > -#define dend         d3
> > +#define vend         v2
> > +#define dend         d2
> >
> >  /*
> >     Core algorithm:
> >
> > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> four bits
> > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> > -   set likewise for odd bytes so that adjacent bytes can be merged.
> Since the
> > -   bits in the syndrome reflect the order in which things occur in the
> original
> > -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> > +   per byte. We take 4 bits of every comparison byte with shift right
> and narrow
> > +   by 4 instruction. Since the bits in the nibble mask reflect the
> order in
> > +   which things occur in the original string, counting trailing zeros
> identifies
> > +   exactly which byte matched.  */
> >
> >  ENTRY (__strnlen)
> >       PTR_ARG (0)
> >       SIZE_ARG (1)
> >       bic     src, srcin, 15
> > -     mov     wtmp, 0xf00f
> >       cbz     cntin, L(nomatch)
> >       ld1     {vdata.16b}, [src], 16
> > -     dup     vrepmask.8h, wtmp
> >       cmeq    vhas_chr.16b, vdata.16b, 0
> >       lsl     shift, srcin, 2
> > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> >       fmov    synd, dend
> >       lsr     synd, synd, shift
> >       cbz     synd, L(start_loop)
> > @@ -103,8 +97,7 @@ L(loop32_2):
> >       cbz     synd, L(loop32)
> >
> >  L(end):
> > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> >       sub     src, src, 16
> >       mov     synd, vend.d[0]
> >       sub     result, src, srcin
> > --
> > 2.37.0.rc0.104.g0611611a94-goog
> >
>


* Re: [PATCH] aarch64: Optimize string functions with shrn instruction
  2022-06-21  9:28   ` Danila Kutenin
@ 2022-06-22  6:48     ` Szabolcs Nagy
  2022-06-22  7:31       ` Danila Kutenin
  0 siblings, 1 reply; 12+ messages in thread
From: Szabolcs Nagy @ 2022-06-22  6:48 UTC (permalink / raw)
  To: Danila Kutenin; +Cc: libc-alpha, Danila Kutenin

The 06/21/2022 10:28, Danila Kutenin wrote:
> It's a contribution from Google. Google has a copyright assignment with
> fsf, I think this should cover it.

note that if you are interested in getting the same improvements
into bionic, musl, llvm-libc, newlib,...
then arm maintains an optimized-routines repo on github for that
purpose and you are welcome to contribute your changes there too.

> 
> Sorry for the email confusion, I realized the mess quite late: my git
> client was configured with yandex and the commit was set up with the
> google's account. Yandex has nothing to do with this work. If needed, I can
> recreate the patch

please do, we should not commit pastebin links into the git history.
(just list the measured improvements over the old code in the commit
message, if it's too long then select representative measurements
or aggregate them in some other way.)

> 
> On Tue, Jun 21, 2022, 10:08 Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:
> 
> > The 06/20/2022 17:46, Danila Kutenin wrote:
> > > From: Danila Kutenin <kutdanila@yandex.ru>
> > >
> > > We found that string functions were using AND+ADDP
> > > to find the nibble/syndrome mask but there is an easier
> > > opportunity through `SHRN dst, src, 4` and has same
> > > latency on all SIMD ARMv8 targets as ADDP. There are also
> > > gaps for memcmp but that's probably for another patch
> > >
> > > We see 10-20% savings for small-mid size cases which are
> > > primary cases for general workloads https://pastebin.com/hA5Fd8eM

this is a good improvement.
we will do some checks (on various cpus).

thanks.


> > >
> > > I don't have commit rights, asking maintainers to do that
> > >
> > > Signed-off-by: Danila Kutenin <danilak@google.com>
> >
> > is this a contribution from google or yandex or personal?
> >
> > (e.g. if your company has copyright assignment with fsf then
> > you dont need signed-off-by, otherwise it's better to have
> > the email address consistent with the author address)
> >
> > > ---
> > >  sysdeps/aarch64/memchr.S    | 19 +++++++------------
> > >  sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
> > >  sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
> > >  sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
> > >  sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
> > >  sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
> > >  6 files changed, 57 insertions(+), 98 deletions(-)
> > >
> > > diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> > > index b060eee97d..b983489491 100644
> > > --- a/sysdeps/aarch64/memchr.S
> > > +++ b/sysdeps/aarch64/memchr.S
> > > @@ -53,12 +53,11 @@
> > >
> > >  /*
> > >     Core algorithm:
> > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > four bits
> > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > matched the
> > > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> > 4-7 are
> > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > Since the
> > > -   bits in the syndrome reflect the order in which things occur in the
> > original
> > > -   string, counting trailing zeros identifies exactly which byte
> > matched.  */
> > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> > four bits
> > > +   per byte. We take 4 bits of every comparison byte with shift right
> > and narrow
> > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > order in
> > > +   which things occur in the original string, counting leading zeros
> > identifies
> > > +   exactly which byte matched.  */
> > >
> > >  ENTRY (MEMCHR)
> > >       PTR_ARG (0)
> > > @@ -67,12 +66,9 @@ ENTRY (MEMCHR)
> > >       cbz     cntin, L(nomatch)
> > >       ld1     {vdata.16b}, [src]
> > >       dup     vrepchr.16b, chrin
> > > -     mov     wtmp, 0xf00f
> > > -     dup     vrepmask.8h, wtmp
> > >       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
> > >       lsl     shift, srcin, 2
> > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > >       fmov    synd, dend
> > >       lsr     synd, synd, shift
> > >       cbz     synd, L(start_loop)
> > > @@ -111,8 +107,7 @@ L(loop32_2):
> > >       fmov    synd, dend
> > >       cbz     synd, L(loop32)
> > >  L(end):
> > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > >       fmov    synd, dend
> > >       add     tmp, srcin, cntin
> > >       sub     cntrem, tmp, src
> > > diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> > > index e0efbad91c..5179320720 100644
> > > --- a/sysdeps/aarch64/memrchr.S
> > > +++ b/sysdeps/aarch64/memrchr.S
> > > @@ -37,7 +37,6 @@
> > >  #define synd         x5
> > >  #define shift                x6
> > >  #define      tmp             x7
> > > -#define wtmp         w7
> > >  #define end          x8
> > >  #define endm1                x9
> > >
> > > @@ -45,18 +44,16 @@
> > >  #define qdata                q1
> > >  #define vdata                v1
> > >  #define vhas_chr     v2
> > > -#define vrepmask     v3
> > > -#define vend         v4
> > > -#define dend         d4
> > > +#define vend         v3
> > > +#define dend         d3
> > >
> > >  /*
> > >     Core algorithm:
> > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > four bits
> > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > matched the
> > > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> > 4-7 are
> > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > Since the
> > > -   bits in the syndrome reflect the order in which things occur in the
> > original
> > > -   string, counting trailing zeros identifies exactly which byte
> > matched.  */
> > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> > four bits
> > > +   per byte. We take 4 bits of every comparison byte with shift right
> > and narrow
> > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > order in
> > > +   which things occur in the original string, counting leading zeros
> > identifies
> > > +   exactly which byte matched.  */
> > >
> > >  ENTRY (__memrchr)
> > >       PTR_ARG (0)
> > > @@ -67,12 +64,9 @@ ENTRY (__memrchr)
> > >       cbz     cntin, L(nomatch)
> > >       ld1     {vdata.16b}, [src]
> > >       dup     vrepchr.16b, chrin
> > > -     mov     wtmp, 0xf00f
> > > -     dup     vrepmask.8h, wtmp
> > >       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
> > >       neg     shift, end, lsl 2
> > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > >       fmov    synd, dend
> > >       lsl     synd, synd, shift
> > >       cbz     synd, L(start_loop)
> > > @@ -109,8 +103,7 @@ L(loop32_2):
> > >       fmov    synd, dend
> > >       cbz     synd, L(loop32)
> > >  L(end):
> > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > >       fmov    synd, dend
> > >
> > >       add     tmp, src, 15
> > > diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> > > index 442726fd49..ee154ab74b 100644
> > > --- a/sysdeps/aarch64/strchrnul.S
> > > +++ b/sysdeps/aarch64/strchrnul.S
> > > @@ -33,38 +33,32 @@
> > >  #define src          x2
> > >  #define tmp1         x1
> > >  #define tmp2         x3
> > > -#define tmp2w                w3
> > >
> > >  #define vrepchr              v0
> > >  #define vdata                v1
> > >  #define qdata                q1
> > >  #define vhas_nul     v2
> > >  #define vhas_chr     v3
> > > -#define vrepmask     v4
> > > -#define vend         v5
> > > -#define dend         d5
> > > +#define vend         v4
> > > +#define dend         d4
> > >
> > > -/* Core algorithm:
> > > -
> > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > four bits
> > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > matched the
> > > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> > 4-7 are
> > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > Since the
> > > -   bits in the syndrome reflect the order in which things occur in the
> > original
> > > -   string, counting trailing zeros identifies exactly which byte
> > matched.  */
> > > +/*
> > > +   Core algorithm:
> > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> > four bits
> > > +   per byte. We take 4 bits of every comparison byte with shift right
> > and narrow
> > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > order in
> > > +   which things occur in the original string, counting leading zeros
> > identifies
> > > +   exactly which byte matched.  */
> > >
> > >  ENTRY (__strchrnul)
> > >       PTR_ARG (0)
> > >       bic     src, srcin, 15
> > >       dup     vrepchr.16b, chrin
> > >       ld1     {vdata.16b}, [src]
> > > -     mov     tmp2w, 0xf00f
> > > -     dup     vrepmask.8h, tmp2w
> > >       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
> > >       cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b
> > >       lsl     tmp2, srcin, 2
> > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > >       fmov    tmp1, dend
> > >       lsr     tmp1, tmp1, tmp2        /* Mask padding bits.  */
> > >       cbz     tmp1, L(loop)
> > > @@ -83,8 +77,7 @@ L(loop):
> > >       fmov    tmp1, dend
> > >       cbz     tmp1, L(loop)
> > >
> > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > >       fmov    tmp1, dend
> > >  #ifndef __AARCH64EB__
> > >       rbit    tmp1, tmp1
> > > diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> > > index da53170ece..78d27b4aa6 100644
> > > --- a/sysdeps/aarch64/strcpy.S
> > > +++ b/sysdeps/aarch64/strcpy.S
> > > @@ -40,7 +40,6 @@
> > >  #define len          x4
> > >  #define synd         x4
> > >  #define      tmp             x5
> > > -#define wtmp         w5
> > >  #define shift                x5
> > >  #define data1                x6
> > >  #define dataw1               w6
> > > @@ -50,9 +49,8 @@
> > >  #define dataq                q0
> > >  #define vdata                v0
> > >  #define vhas_nul     v1
> > > -#define vrepmask     v2
> > > -#define vend         v3
> > > -#define dend         d3
> > > +#define vend         v2
> > > +#define dend         d2
> > >  #define dataq2               q1
> > >
> > >  #ifdef BUILD_STPCPY
> > > @@ -63,34 +61,29 @@
> > >  # define IFSTPCPY(X,...)
> > >  #endif
> > >
> > > -/* Core algorithm:
> > > -
> > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > four bits
> > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > matched the
> > > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> > 4-7 are
> > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > Since the
> > > -   bits in the syndrome reflect the order in which things occur in the
> > original
> > > -   string, counting trailing zeros identifies exactly which byte
> > matched.  */
> > > +/*
> > > +   Core algorithm:
> > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> > four bits
> > > +   per byte. We take 4 bits of every comparison byte with shift right
> > and narrow
> > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > order in
> > > +   which things occur in the original string, counting leading zeros
> > identifies
> > > +   exactly which byte matched.  */
> > >
> > >  ENTRY (STRCPY)
> > >       PTR_ARG (0)
> > >       PTR_ARG (1)
> > >       bic     src, srcin, 15
> > > -     mov     wtmp, 0xf00f
> > >       ld1     {vdata.16b}, [src]
> > > -     dup     vrepmask.8h, wtmp
> > >       cmeq    vhas_nul.16b, vdata.16b, 0
> > >       lsl     shift, srcin, 2
> > > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> > > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> > >       fmov    synd, dend
> > >       lsr     synd, synd, shift
> > >       cbnz    synd, L(tail)
> > >
> > >       ldr     dataq, [src, 16]!
> > >       cmeq    vhas_nul.16b, vdata.16b, 0
> > > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> > > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> > >       fmov    synd, dend
> > >       cbz     synd, L(start_loop)
> > >
> > > @@ -162,8 +155,7 @@ L(loop):
> > >       fmov    synd, dend
> > >       cbz     synd, L(loop)
> > >
> > > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> > >       fmov    synd, dend
> > >  #ifndef __AARCH64EB__
> > >       rbit    synd, synd
> > > diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> > > index a2310871c2..3a5d088407 100644
> > > --- a/sysdeps/aarch64/strlen.S
> > > +++ b/sysdeps/aarch64/strlen.S
> > > @@ -34,35 +34,29 @@
> > >  #define src          x1
> > >  #define      synd            x2
> > >  #define tmp          x3
> > > -#define wtmp         w3
> > >  #define shift                x4
> > >
> > >  #define data         q0
> > >  #define vdata                v0
> > >  #define vhas_nul     v1
> > > -#define vrepmask     v2
> > > -#define vend         v3
> > > -#define dend         d3
> > > +#define vend         v2
> > > +#define dend         d2
> > >
> > >  /* Core algorithm:
> > >
> > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > four bits
> > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > matched the
> > > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> > 4-7 are
> > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > Since the
> > > -   bits in the syndrome reflect the order in which things occur in the
> > original
> > > -   string, counting trailing zeros identifies exactly which byte
> > matched.  */
> > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> > four bits
> > > +   per byte. We take 4 bits of every comparison byte with shift right
> > and narrow
> > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > order in
> > > +   which things occur in the original string, counting trailing zeros
> > identifies
> > > +   exactly which byte matched.  */
> > >
> > >  ENTRY (STRLEN)
> > >       PTR_ARG (0)
> > >       bic     src, srcin, 15
> > > -     mov     wtmp, 0xf00f
> > >       ld1     {vdata.16b}, [src]
> > > -     dup     vrepmask.8h, wtmp
> > >       cmeq    vhas_nul.16b, vdata.16b, 0
> > >       lsl     shift, srcin, 2
> > > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> > >       fmov    synd, dend
> > >       lsr     synd, synd, shift
> > >       cbz     synd, L(loop)
> > > @@ -80,8 +74,7 @@ L(loop):
> > >       fmov    synd, dend
> > >       cbz     synd, L(loop)
> > >
> > > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> > >       sub     result, src, srcin
> > >       fmov    synd, dend
> > >  #ifndef __AARCH64EB__
> > > diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> > > index 0dbecb0ce9..282bddc9aa 100644
> > > --- a/sysdeps/aarch64/strnlen.S
> > > +++ b/sysdeps/aarch64/strnlen.S
> > > @@ -33,39 +33,33 @@
> > >  #define src          x2
> > >  #define synd         x3
> > >  #define      shift           x4
> > > -#define wtmp         w4
> > >  #define tmp          x4
> > >  #define cntrem               x5
> > >
> > >  #define qdata                q0
> > >  #define vdata                v0
> > >  #define vhas_chr     v1
> > > -#define vrepmask     v2
> > > -#define vend         v3
> > > -#define dend         d3
> > > +#define vend         v2
> > > +#define dend         d2
> > >
> > >  /*
> > >     Core algorithm:
> > >
> > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > four bits
> > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > matched the
> > > -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> > 4-7 are
> > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > Since the
> > > -   bits in the syndrome reflect the order in which things occur in the
> > original
> > > -   string, counting trailing zeros identifies exactly which byte
> > matched.  */
> > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> > four bits
> > > +   per byte. We take 4 bits of every comparison byte with shift right
> > and narrow
> > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > order in
> > > +   which things occur in the original string, counting trailing zeros
> > identifies
> > > +   exactly which byte matched.  */
> > >
> > >  ENTRY (__strnlen)
> > >       PTR_ARG (0)
> > >       SIZE_ARG (1)
> > >       bic     src, srcin, 15
> > > -     mov     wtmp, 0xf00f
> > >       cbz     cntin, L(nomatch)
> > >       ld1     {vdata.16b}, [src], 16
> > > -     dup     vrepmask.8h, wtmp
> > >       cmeq    vhas_chr.16b, vdata.16b, 0
> > >       lsl     shift, srcin, 2
> > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > >       fmov    synd, dend
> > >       lsr     synd, synd, shift
> > >       cbz     synd, L(start_loop)
> > > @@ -103,8 +97,7 @@ L(loop32_2):
> > >       cbz     synd, L(loop32)
> > >
> > >  L(end):
> > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> > */
> > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > >       sub     src, src, 16
> > >       mov     synd, vend.d[0]
> > >       sub     result, src, srcin
> > > --
> > > 2.37.0.rc0.104.g0611611a94-goog
> > >
> >

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] aarch64: Optimize string functions with shrn instruction
  2022-06-22  6:48     ` Szabolcs Nagy
@ 2022-06-22  7:31       ` Danila Kutenin
  2022-06-22  8:40         ` Szabolcs Nagy
  0 siblings, 1 reply; 12+ messages in thread
From: Danila Kutenin @ 2022-06-22  7:31 UTC (permalink / raw)
  To: Szabolcs Nagy; +Cc: libc-alpha, Danila Kutenin

>
> note that if you are interested in getting the same improvements
> into bionic, musl, llvm-libc, newlib,...
> then arm maintains an optimized-routines repo on github for that
> purpose and you are welcome to contribute your changes there too.


We have communicated with llvm-libc and bionic; we will reach out to the
others, thanks!

> please do, we should not commit pastebin links into the git history.
> (just list the measured improvements over the old code in the commit
> message, if it's too long then select representative measurements
> or aggregate them in some other way.)


Created a new one; this one should be abandoned. My employer still
recommended that I add a Signed-off-by line at the end

> this is good improvement.
> we will do some checks (on various cpus).


 Thanks!

On Wed, Jun 22, 2022 at 7:49 AM Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:

> The 06/21/2022 10:28, Danila Kutenin wrote:
> > It's a contribution from Google. Google has a copyright assignment with
> > fsf, I think this should cover it.
>
> note that if you are interested in getting the same improvements
> into bionic, musl, llvm-libc, newlib,...
> then arm maintains an optimized-routines repo on github for that
> purpose and you are welcome to contribute your changes there too.
>
> >
> > Sorry for the email confusion, I realized the mess quite late: my git
> > client was configured with yandex and the commit was set up with the
> > google's account. Yandex has nothing to do with this work. If needed, I
> can
> > recreate the patch
>
> please do, we should not commit pastebin links into the git history.
> (just list the measured improvements over the old code in the commit
> message, if it's too long then select representative measurements
> or aggregate them in some other way.)
>
> >
> > On Tue, Jun 21, 2022, 10:08 Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:
> >
> > > The 06/20/2022 17:46, Danila Kutenin wrote:
> > > > From: Danila Kutenin <kutdanila@yandex.ru>
> > > >
> > > > We found that string functions were using AND+ADDP
> > > > to find the nibble/syndrome mask but there is an easier
> > > > opportunity through `SHRN dst, src, 4` and has same
> > > > latency on all SIMD ARMv8 targets as ADDP. There are also
> > > > gaps for memcmp but that's probably for another patch
> > > >
> > > > We see 10-20% savings for small-mid size cases which are
> > > > primary cases for general workloads https://pastebin.com/hA5Fd8eM
>
> this is good improvement.
> we will do some checks (on various cpus).
>
> thanks.
>
>
> > > >
> > > > I don't have commit rights, asking maintainers to do that
> > > >
> > > > Signed-off-by: Danila Kutenin <danilak@google.com>
> > >
> > > is this a contribution from google or yandex or personal?
> > >
> > > (e.g. if your company has copyright assignment with fsf then
> > > you dont need signed-off-by, otherwise it's better to have
> > > the email address consistent with the author address)
> > >
> > > > ---
> > > >  sysdeps/aarch64/memchr.S    | 19 +++++++------------
> > > >  sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
> > > >  sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
> > > >  sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
> > > >  sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
> > > >  sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
> > > >  6 files changed, 57 insertions(+), 98 deletions(-)
> > > >
> > > > diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> > > > index b060eee97d..b983489491 100644
> > > > --- a/sysdeps/aarch64/memchr.S
> > > > +++ b/sysdeps/aarch64/memchr.S
> > > > @@ -53,12 +53,11 @@
> > > >
> > > >  /*
> > > >     Core algorithm:
> > > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > > four bits
> > > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > > matched the
> > > > -   requested character or the byte is NUL. Bits 4-7 must be zero.
> Bits
> > > 4-7 are
> > > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > > Since the
> > > > -   bits in the syndrome reflect the order in which things occur in
> the
> > > original
> > > > -   string, counting trailing zeros identifies exactly which byte
> > > matched.  */
> > > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value
> with
> > > four bits
> > > > +   per byte. We take 4 bits of every comparison byte with shift
> right
> > > and narrow
> > > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > > order in
> > > > +   which things occur in the original string, counting leading zeros
> > > identifies
> > > > +   exactly which byte matched.  */
> > > >
> > > >  ENTRY (MEMCHR)
> > > >       PTR_ARG (0)
> > > > @@ -67,12 +66,9 @@ ENTRY (MEMCHR)
> > > >       cbz     cntin, L(nomatch)
> > > >       ld1     {vdata.16b}, [src]
> > > >       dup     vrepchr.16b, chrin
> > > > -     mov     wtmp, 0xf00f
> > > > -     dup     vrepmask.8h, wtmp
> > > >       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
> > > >       lsl     shift, srcin, 2
> > > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > > >       fmov    synd, dend
> > > >       lsr     synd, synd, shift
> > > >       cbz     synd, L(start_loop)
> > > > @@ -111,8 +107,7 @@ L(loop32_2):
> > > >       fmov    synd, dend
> > > >       cbz     synd, L(loop32)
> > > >  L(end):
> > > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > > >       fmov    synd, dend
> > > >       add     tmp, srcin, cntin
> > > >       sub     cntrem, tmp, src
> > > > diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> > > > index e0efbad91c..5179320720 100644
> > > > --- a/sysdeps/aarch64/memrchr.S
> > > > +++ b/sysdeps/aarch64/memrchr.S
> > > > @@ -37,7 +37,6 @@
> > > >  #define synd         x5
> > > >  #define shift                x6
> > > >  #define      tmp             x7
> > > > -#define wtmp         w7
> > > >  #define end          x8
> > > >  #define endm1                x9
> > > >
> > > > @@ -45,18 +44,16 @@
> > > >  #define qdata                q1
> > > >  #define vdata                v1
> > > >  #define vhas_chr     v2
> > > > -#define vrepmask     v3
> > > > -#define vend         v4
> > > > -#define dend         d4
> > > > +#define vend         v3
> > > > +#define dend         d3
> > > >
> > > >  /*
> > > >     Core algorithm:
> > > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > > four bits
> > > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > > matched the
> > > > -   requested character or the byte is NUL. Bits 4-7 must be zero.
> Bits
> > > 4-7 are
> > > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > > Since the
> > > > -   bits in the syndrome reflect the order in which things occur in
> the
> > > original
> > > > -   string, counting trailing zeros identifies exactly which byte
> > > matched.  */
> > > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value
> with
> > > four bits
> > > > +   per byte. We take 4 bits of every comparison byte with shift
> right
> > > and narrow
> > > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > > order in
> > > > +   which things occur in the original string, counting leading zeros
> > > identifies
> > > > +   exactly which byte matched.  */
> > > >
> > > >  ENTRY (__memrchr)
> > > >       PTR_ARG (0)
> > > > @@ -67,12 +64,9 @@ ENTRY (__memrchr)
> > > >       cbz     cntin, L(nomatch)
> > > >       ld1     {vdata.16b}, [src]
> > > >       dup     vrepchr.16b, chrin
> > > > -     mov     wtmp, 0xf00f
> > > > -     dup     vrepmask.8h, wtmp
> > > >       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
> > > >       neg     shift, end, lsl 2
> > > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > > >       fmov    synd, dend
> > > >       lsl     synd, synd, shift
> > > >       cbz     synd, L(start_loop)
> > > > @@ -109,8 +103,7 @@ L(loop32_2):
> > > >       fmov    synd, dend
> > > >       cbz     synd, L(loop32)
> > > >  L(end):
> > > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > > >       fmov    synd, dend
> > > >
> > > >       add     tmp, src, 15
> > > > diff --git a/sysdeps/aarch64/strchrnul.S
> b/sysdeps/aarch64/strchrnul.S
> > > > index 442726fd49..ee154ab74b 100644
> > > > --- a/sysdeps/aarch64/strchrnul.S
> > > > +++ b/sysdeps/aarch64/strchrnul.S
> > > > @@ -33,38 +33,32 @@
> > > >  #define src          x2
> > > >  #define tmp1         x1
> > > >  #define tmp2         x3
> > > > -#define tmp2w                w3
> > > >
> > > >  #define vrepchr              v0
> > > >  #define vdata                v1
> > > >  #define qdata                q1
> > > >  #define vhas_nul     v2
> > > >  #define vhas_chr     v3
> > > > -#define vrepmask     v4
> > > > -#define vend         v5
> > > > -#define dend         d5
> > > > +#define vend         v4
> > > > +#define dend         d4
> > > >
> > > > -/* Core algorithm:
> > > > -
> > > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > > four bits
> > > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > > matched the
> > > > -   requested character or the byte is NUL. Bits 4-7 must be zero.
> Bits
> > > 4-7 are
> > > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > > Since the
> > > > -   bits in the syndrome reflect the order in which things occur in
> the
> > > original
> > > > -   string, counting trailing zeros identifies exactly which byte
> > > matched.  */
> > > > +/*
> > > > +   Core algorithm:
> > > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value
> with
> > > four bits
> > > > +   per byte. We take 4 bits of every comparison byte with shift
> right
> > > and narrow
> > > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > > order in
> > > > +   which things occur in the original string, counting leading zeros
> > > identifies
> > > > +   exactly which byte matched.  */
> > > >
> > > >  ENTRY (__strchrnul)
> > > >       PTR_ARG (0)
> > > >       bic     src, srcin, 15
> > > >       dup     vrepchr.16b, chrin
> > > >       ld1     {vdata.16b}, [src]
> > > > -     mov     tmp2w, 0xf00f
> > > > -     dup     vrepmask.8h, tmp2w
> > > >       cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
> > > >       cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b
> > > >       lsl     tmp2, srcin, 2
> > > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > > >       fmov    tmp1, dend
> > > >       lsr     tmp1, tmp1, tmp2        /* Mask padding bits.  */
> > > >       cbz     tmp1, L(loop)
> > > > @@ -83,8 +77,7 @@ L(loop):
> > > >       fmov    tmp1, dend
> > > >       cbz     tmp1, L(loop)
> > > >
> > > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > > >       fmov    tmp1, dend
> > > >  #ifndef __AARCH64EB__
> > > >       rbit    tmp1, tmp1
> > > > diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> > > > index da53170ece..78d27b4aa6 100644
> > > > --- a/sysdeps/aarch64/strcpy.S
> > > > +++ b/sysdeps/aarch64/strcpy.S
> > > > @@ -40,7 +40,6 @@
> > > >  #define len          x4
> > > >  #define synd         x4
> > > >  #define      tmp             x5
> > > > -#define wtmp         w5
> > > >  #define shift                x5
> > > >  #define data1                x6
> > > >  #define dataw1               w6
> > > > @@ -50,9 +49,8 @@
> > > >  #define dataq                q0
> > > >  #define vdata                v0
> > > >  #define vhas_nul     v1
> > > > -#define vrepmask     v2
> > > > -#define vend         v3
> > > > -#define dend         d3
> > > > +#define vend         v2
> > > > +#define dend         d2
> > > >  #define dataq2               q1
> > > >
> > > >  #ifdef BUILD_STPCPY
> > > > @@ -63,34 +61,29 @@
> > > >  # define IFSTPCPY(X,...)
> > > >  #endif
> > > >
> > > > -/* Core algorithm:
> > > > -
> > > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > > four bits
> > > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > > matched the
> > > > -   requested character or the byte is NUL. Bits 4-7 must be zero.
> Bits
> > > 4-7 are
> > > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > > Since the
> > > > -   bits in the syndrome reflect the order in which things occur in
> the
> > > original
> > > > -   string, counting trailing zeros identifies exactly which byte
> > > matched.  */
> > > > +/*
> > > > +   Core algorithm:
> > > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value
> with
> > > four bits
> > > > +   per byte. We take 4 bits of every comparison byte with shift
> right
> > > and narrow
> > > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > > order in
> > > > +   which things occur in the original string, counting leading zeros
> > > identifies
> > > > +   exactly which byte matched.  */
> > > >
> > > >  ENTRY (STRCPY)
> > > >       PTR_ARG (0)
> > > >       PTR_ARG (1)
> > > >       bic     src, srcin, 15
> > > > -     mov     wtmp, 0xf00f
> > > >       ld1     {vdata.16b}, [src]
> > > > -     dup     vrepmask.8h, wtmp
> > > >       cmeq    vhas_nul.16b, vdata.16b, 0
> > > >       lsl     shift, srcin, 2
> > > > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> > > > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> > > >       fmov    synd, dend
> > > >       lsr     synd, synd, shift
> > > >       cbnz    synd, L(tail)
> > > >
> > > >       ldr     dataq, [src, 16]!
> > > >       cmeq    vhas_nul.16b, vdata.16b, 0
> > > > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> > > > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> > > >       fmov    synd, dend
> > > >       cbz     synd, L(start_loop)
> > > >
> > > > @@ -162,8 +155,7 @@ L(loop):
> > > >       fmov    synd, dend
> > > >       cbz     synd, L(loop)
> > > >
> > > > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> > > >       fmov    synd, dend
> > > >  #ifndef __AARCH64EB__
> > > >       rbit    synd, synd
> > > > diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> > > > index a2310871c2..3a5d088407 100644
> > > > --- a/sysdeps/aarch64/strlen.S
> > > > +++ b/sysdeps/aarch64/strlen.S
> > > > @@ -34,35 +34,29 @@
> > > >  #define src          x1
> > > >  #define      synd            x2
> > > >  #define tmp          x3
> > > > -#define wtmp         w3
> > > >  #define shift                x4
> > > >
> > > >  #define data         q0
> > > >  #define vdata                v0
> > > >  #define vhas_nul     v1
> > > > -#define vrepmask     v2
> > > > -#define vend         v3
> > > > -#define dend         d3
> > > > +#define vend         v2
> > > > +#define dend         d2
> > > >
> > > >  /* Core algorithm:
> > > >
> > > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > > four bits
> > > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > > matched the
> > > > -   requested character or the byte is NUL. Bits 4-7 must be zero.
> Bits
> > > 4-7 are
> > > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > > Since the
> > > > -   bits in the syndrome reflect the order in which things occur in
> the
> > > original
> > > > -   string, counting trailing zeros identifies exactly which byte
> > > matched.  */
> > > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value
> with
> > > four bits
> > > > +   per byte. We take 4 bits of every comparison byte with shift
> right
> > > and narrow
> > > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > > order in
> > > > +   which things occur in the original string, counting trailing
> zeros
> > > identifies
> > > > +   exactly which byte matched.  */
> > > >
> > > >  ENTRY (STRLEN)
> > > >       PTR_ARG (0)
> > > >       bic     src, srcin, 15
> > > > -     mov     wtmp, 0xf00f
> > > >       ld1     {vdata.16b}, [src]
> > > > -     dup     vrepmask.8h, wtmp
> > > >       cmeq    vhas_nul.16b, vdata.16b, 0
> > > >       lsl     shift, srcin, 2
> > > > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> > > >       fmov    synd, dend
> > > >       lsr     synd, synd, shift
> > > >       cbz     synd, L(loop)
> > > > @@ -80,8 +74,7 @@ L(loop):
> > > >       fmov    synd, dend
> > > >       cbz     synd, L(loop)
> > > >
> > > > -     and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
> > > >       sub     result, src, srcin
> > > >       fmov    synd, dend
> > > >  #ifndef __AARCH64EB__
> > > > diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> > > > index 0dbecb0ce9..282bddc9aa 100644
> > > > --- a/sysdeps/aarch64/strnlen.S
> > > > +++ b/sysdeps/aarch64/strnlen.S
> > > > @@ -33,39 +33,33 @@
> > > >  #define src          x2
> > > >  #define synd         x3
> > > >  #define      shift           x4
> > > > -#define wtmp         w4
> > > >  #define tmp          x4
> > > >  #define cntrem               x5
> > > >
> > > >  #define qdata                q0
> > > >  #define vdata                v0
> > > >  #define vhas_chr     v1
> > > > -#define vrepmask     v2
> > > > -#define vend         v3
> > > > -#define dend         d3
> > > > +#define vend         v2
> > > > +#define dend         d2
> > > >
> > > >  /*
> > > >     Core algorithm:
> > > >
> > > > -   For each 16-byte chunk we calculate a 64-bit syndrome value with
> > > four bits
> > > > -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> > > matched the
> > > > -   requested character or the byte is NUL. Bits 4-7 must be zero.
> Bits
> > > 4-7 are
> > > > -   set likewise for odd bytes so that adjacent bytes can be merged.
> > > Since the
> > > > -   bits in the syndrome reflect the order in which things occur in
> the
> > > original
> > > > -   string, counting trailing zeros identifies exactly which byte
> > > matched.  */
> > > > +   For each 16-byte chunk we calculate a 64-bit nibble mask value
> with
> > > four bits
> > > > +   per byte. We take 4 bits of every comparison byte with shift
> right
> > > and narrow
> > > > +   by 4 instruction. Since the bits in the nibble mask reflect the
> > > order in
> > > > +   which things occur in the original string, counting trailing
> zeros
> > > identifies
> > > > +   exactly which byte matched.  */
> > > >
> > > >  ENTRY (__strnlen)
> > > >       PTR_ARG (0)
> > > >       SIZE_ARG (1)
> > > >       bic     src, srcin, 15
> > > > -     mov     wtmp, 0xf00f
> > > >       cbz     cntin, L(nomatch)
> > > >       ld1     {vdata.16b}, [src], 16
> > > > -     dup     vrepmask.8h, wtmp
> > > >       cmeq    vhas_chr.16b, vdata.16b, 0
> > > >       lsl     shift, srcin, 2
> > > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > > >       fmov    synd, dend
> > > >       lsr     synd, synd, shift
> > > >       cbz     synd, L(start_loop)
> > > > @@ -103,8 +97,7 @@ L(loop32_2):
> > > >       cbz     synd, L(loop32)
> > > >
> > > >  L(end):
> > > > -     and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > -     addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /*
> 128->64
> > > */
> > > > +     shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
> > > >       sub     src, src, 16
> > > >       mov     synd, vend.d[0]
> > > >       sub     result, src, srcin
> > > > --
> > > > 2.37.0.rc0.104.g0611611a94-goog
> > > >
> > >
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] aarch64: Optimize string functions with shrn instruction
  2022-06-22  7:31       ` Danila Kutenin
@ 2022-06-22  8:40         ` Szabolcs Nagy
  0 siblings, 0 replies; 12+ messages in thread
From: Szabolcs Nagy @ 2022-06-22  8:40 UTC (permalink / raw)
  To: Danila Kutenin; +Cc: libc-alpha, Danila Kutenin

The 06/22/2022 08:31, Danila Kutenin wrote:
> >
> > note that if you are interested in getting the same improvements
> > into bionic, musl, llvm-libc, newlib,...
> > then arm maintains an optimized-routines repo on github for that
> > purpose and you are welcome to contribute your changes there too.
> 
> 
> We have communicated with llvm-libc and bionic; we will reach out to the
> others, thanks!

note that many projects take code from arm optimized-routines; we
don't have an exact list. e.g. bionic uses it directly; if you
fork that, then merging fixes later will be more expensive.

> > please do, we should not commit pastebin links into the git history.
> > (just list the measured improvements over the old code in the commit
> > message, if it's too long then select representative measurements
> > or aggregate them in some other way.)
> 
> 
> Created a new one; this one should be abandoned. My employer still
> recommended that I add a Signed-off-by line at the end

I will wait for somebody from glibc or the FSF to confirm that
it is OK to commit it this way.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] aarch64: Optimize string functions with shrn instruction
  2022-07-06  7:42 ` Danila Kutenin
@ 2022-07-06  8:30   ` Szabolcs Nagy
  0 siblings, 0 replies; 12+ messages in thread
From: Szabolcs Nagy @ 2022-07-06  8:30 UTC (permalink / raw)
  To: Danila Kutenin; +Cc: libc-alpha

The 07/06/2022 08:42, Danila Kutenin wrote:
> Friendly ping
> 
> On Mon, Jun 27, 2022 at 5:12 PM Danila Kutenin <danilak@google.com> wrote:
> 
> > We found that string functions were using AND+ADDP
> > to find the nibble/syndrome mask but there is an easier
> > opportunity through `SHRN dst.8b, src.8h, 4` (shift
> > right every 2 bytes by 4 and narrow to 1 byte) and has
> > same latency on all SIMD ARMv8 targets as ADDP. There
> > are also possible gaps for memcmp but that's for
> > another patch.
> >
> > We see 10-20% savings for small-mid size cases (<=128)
> > which are primary cases for general workloads.


thanks for the patch, committed it now.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] aarch64: Optimize string functions with shrn instruction
  2022-06-27 16:12 Danila Kutenin
@ 2022-07-06  7:42 ` Danila Kutenin
  2022-07-06  8:30   ` Szabolcs Nagy
  0 siblings, 1 reply; 12+ messages in thread
From: Danila Kutenin @ 2022-07-06  7:42 UTC (permalink / raw)
  To: libc-alpha; +Cc: Szabolcs.Nagy

Friendly ping

On Mon, Jun 27, 2022 at 5:12 PM Danila Kutenin <danilak@google.com> wrote:

> We found that string functions were using AND+ADDP
> to find the nibble/syndrome mask but there is an easier
> opportunity through `SHRN dst.8b, src.8h, 4` (shift
> right every 2 bytes by 4 and narrow to 1 byte) and has
> same latency on all SIMD ARMv8 targets as ADDP. There
> are also possible gaps for memcmp but that's for
> another patch.
>
> We see 10-20% savings for small-mid size cases (<=128)
> which are primary cases for general workloads.
> ---
>  sysdeps/aarch64/memchr.S    | 25 +++++++++----------------
>  sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
>  sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
>  sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
>  sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
>  sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
>  6 files changed, 59 insertions(+), 102 deletions(-)
>
> diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> index b060eee97d..2053a977b6 100644
> --- a/sysdeps/aarch64/memchr.S
> +++ b/sysdeps/aarch64/memchr.S
> @@ -41,24 +41,21 @@
>  #define synd           x5
>  #define shift          x6
>  #define        tmp             x7
> -#define wtmp           w7
>
>  #define vrepchr                v0
>  #define qdata          q1
>  #define vdata          v1
>  #define vhas_chr       v2
> -#define vrepmask       v3
> -#define vend           v4
> -#define dend           d4
> +#define vend           v3
> +#define dend           d3
>
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four
> bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since
> the
> -   bits in the syndrome reflect the order in which things occur in the
> original
> -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and
> narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order
> in
> +   which things occur in the original string, counting leading zeros
> identifies
> +   exactly which byte matched.  */
>
>  ENTRY (MEMCHR)
>         PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (MEMCHR)
>         cbz     cntin, L(nomatch)
>         ld1     {vdata.16b}, [src]
>         dup     vrepchr.16b, chrin
> -       mov     wtmp, 0xf00f
> -       dup     vrepmask.8h, wtmp
>         cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>         lsl     shift, srcin, 2
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         lsr     synd, synd, shift
>         cbz     synd, L(start_loop)
> @@ -111,8 +105,7 @@ L(loop32_2):
>         fmov    synd, dend
>         cbz     synd, L(loop32)
>  L(end):
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         add     tmp, srcin, cntin
>         sub     cntrem, tmp, src
> diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> index e0efbad91c..5179320720 100644
> --- a/sysdeps/aarch64/memrchr.S
> +++ b/sysdeps/aarch64/memrchr.S
> @@ -37,7 +37,6 @@
>  #define synd           x5
>  #define shift          x6
>  #define        tmp             x7
> -#define wtmp           w7
>  #define end            x8
>  #define endm1          x9
>
> @@ -45,18 +44,16 @@
>  #define qdata          q1
>  #define vdata          v1
>  #define vhas_chr       v2
> -#define vrepmask       v3
> -#define vend           v4
> -#define dend           d4
> +#define vend           v3
> +#define dend           d3
>
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four
> bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since
> the
> -   bits in the syndrome reflect the order in which things occur in the
> original
> -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and
> narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order
> in
> +   which things occur in the original string, counting leading zeros
> identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__memrchr)
>         PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (__memrchr)
>         cbz     cntin, L(nomatch)
>         ld1     {vdata.16b}, [src]
>         dup     vrepchr.16b, chrin
> -       mov     wtmp, 0xf00f
> -       dup     vrepmask.8h, wtmp
>         cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>         neg     shift, end, lsl 2
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         lsl     synd, synd, shift
>         cbz     synd, L(start_loop)
> @@ -109,8 +103,7 @@ L(loop32_2):
>         fmov    synd, dend
>         cbz     synd, L(loop32)
>  L(end):
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    synd, dend
>
>         add     tmp, src, 15
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 442726fd49..ee154ab74b 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -33,38 +33,32 @@
>  #define src            x2
>  #define tmp1           x1
>  #define tmp2           x3
> -#define tmp2w          w3
>
>  #define vrepchr                v0
>  #define vdata          v1
>  #define qdata          q1
>  #define vhas_nul       v2
>  #define vhas_chr       v3
> -#define vrepmask       v4
> -#define vend           v5
> -#define dend           d5
> +#define vend           v4
> +#define dend           d4
>
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four
> bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since
> the
> -   bits in the syndrome reflect the order in which things occur in the
> original
> -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and
> narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order
> in
> +   which things occur in the original string, counting leading zeros
> identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__strchrnul)
>         PTR_ARG (0)
>         bic     src, srcin, 15
>         dup     vrepchr.16b, chrin
>         ld1     {vdata.16b}, [src]
> -       mov     tmp2w, 0xf00f
> -       dup     vrepmask.8h, tmp2w
>         cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>         cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b
>         lsl     tmp2, srcin, 2
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    tmp1, dend
>         lsr     tmp1, tmp1, tmp2        /* Mask padding bits.  */
>         cbz     tmp1, L(loop)
> @@ -83,8 +77,7 @@ L(loop):
>         fmov    tmp1, dend
>         cbz     tmp1, L(loop)
>
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    tmp1, dend
>  #ifndef __AARCH64EB__
>         rbit    tmp1, tmp1
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index da53170ece..78d27b4aa6 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -40,7 +40,6 @@
>  #define len            x4
>  #define synd           x4
>  #define        tmp             x5
> -#define wtmp           w5
>  #define shift          x5
>  #define data1          x6
>  #define dataw1         w6
> @@ -50,9 +49,8 @@
>  #define dataq          q0
>  #define vdata          v0
>  #define vhas_nul       v1
> -#define vrepmask       v2
> -#define vend           v3
> -#define dend           d3
> +#define vend           v2
> +#define dend           d2
>  #define dataq2         q1
>
>  #ifdef BUILD_STPCPY
> @@ -63,34 +61,29 @@
>  # define IFSTPCPY(X,...)
>  #endif
>
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four
> bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since
> the
> -   bits in the syndrome reflect the order in which things occur in the
> original
> -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and
> narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order
> in
> +   which things occur in the original string, counting leading zeros
> identifies
> +   exactly which byte matched.  */
>
>  ENTRY (STRCPY)
>         PTR_ARG (0)
>         PTR_ARG (1)
>         bic     src, srcin, 15
> -       mov     wtmp, 0xf00f
>         ld1     {vdata.16b}, [src]
> -       dup     vrepmask.8h, wtmp
>         cmeq    vhas_nul.16b, vdata.16b, 0
>         lsl     shift, srcin, 2
> -       and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -       addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> +       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         lsr     synd, synd, shift
>         cbnz    synd, L(tail)
>
>         ldr     dataq, [src, 16]!
>         cmeq    vhas_nul.16b, vdata.16b, 0
> -       and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -       addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> +       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         cbz     synd, L(start_loop)
>
> @@ -162,8 +155,7 @@ L(loop):
>         fmov    synd, dend
>         cbz     synd, L(loop)
>
> -       and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -       addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         fmov    synd, dend
>  #ifndef __AARCH64EB__
>         rbit    synd, synd
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index a2310871c2..3a5d088407 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -34,35 +34,29 @@
>  #define src            x1
>  #define        synd            x2
>  #define tmp            x3
> -#define wtmp           w3
>  #define shift          x4
>
>  #define data           q0
>  #define vdata          v0
>  #define vhas_nul       v1
> -#define vrepmask       v2
> -#define vend           v3
> -#define dend           d3
> +#define vend           v2
> +#define dend           d2
>
>  /* Core algorithm:
>
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four
> bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since
> the
> -   bits in the syndrome reflect the order in which things occur in the
> original
> -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and
> narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order
> in
> +   which things occur in the original string, counting trailing zeros
> identifies
> +   exactly which byte matched.  */
>
>  ENTRY (STRLEN)
>         PTR_ARG (0)
>         bic     src, srcin, 15
> -       mov     wtmp, 0xf00f
>         ld1     {vdata.16b}, [src]
> -       dup     vrepmask.8h, wtmp
>         cmeq    vhas_nul.16b, vdata.16b, 0
>         lsl     shift, srcin, 2
> -       and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -       addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         lsr     synd, synd, shift
>         cbz     synd, L(loop)
> @@ -80,8 +74,7 @@ L(loop):
>         fmov    synd, dend
>         cbz     synd, L(loop)
>
> -       and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -       addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         sub     result, src, srcin
>         fmov    synd, dend
>  #ifndef __AARCH64EB__
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 0dbecb0ce9..282bddc9aa 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -33,39 +33,33 @@
>  #define src            x2
>  #define synd           x3
>  #define        shift           x4
> -#define wtmp           w4
>  #define tmp            x4
>  #define cntrem         x5
>
>  #define qdata          q0
>  #define vdata          v0
>  #define vhas_chr       v1
> -#define vrepmask       v2
> -#define vend           v3
> -#define dend           d3
> +#define vend           v2
> +#define dend           d2
>
>  /*
>     Core algorithm:
>
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four
> bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte
> matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits
> 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since
> the
> -   bits in the syndrome reflect the order in which things occur in the
> original
> -   string, counting trailing zeros identifies exactly which byte
> matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with
> four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and
> narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order
> in
> +   which things occur in the original string, counting trailing zeros
> identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__strnlen)
>         PTR_ARG (0)
>         SIZE_ARG (1)
>         bic     src, srcin, 15
> -       mov     wtmp, 0xf00f
>         cbz     cntin, L(nomatch)
>         ld1     {vdata.16b}, [src], 16
> -       dup     vrepmask.8h, wtmp
>         cmeq    vhas_chr.16b, vdata.16b, 0
>         lsl     shift, srcin, 2
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         lsr     synd, synd, shift
>         cbz     synd, L(start_loop)
> @@ -103,8 +97,7 @@ L(loop32_2):
>         cbz     synd, L(loop32)
>
>  L(end):
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64
> */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         sub     src, src, 16
>         mov     synd, vend.d[0]
>         sub     result, src, srcin
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] aarch64: Optimize string functions with shrn instruction
  2022-06-27 16:02 ` Carlos O'Donell
@ 2022-06-27 16:12   ` Danila Kutenin
  0 siblings, 0 replies; 12+ messages in thread
From: Danila Kutenin @ 2022-06-27 16:12 UTC (permalink / raw)
  To: Carlos O'Donell; +Cc: libc-alpha, Szabolcs.Nagy

Done, thank you for a thorough explanation!

On Mon, Jun 27, 2022 at 5:02 PM Carlos O'Donell <carlos@redhat.com> wrote:

> On 6/22/22 03:29, Danila Kutenin via Libc-alpha wrote:
> > We found that string functions were using AND+ADDP
> > to find the nibble/syndrome mask but there is an easier
> > opportunity through `SHRN dst, src, 4` and has same
> > latency on all SIMD ARMv8 targets as ADDP. There are also
> > possible gaps for memcmp but that's for another patch.
> >
> > We see 10-20% savings for small-mid size cases (<=128)
> > which are primary cases for general workloads.
> >
> > Signed-off-by: Danila Kutenin <danilak@google.com>
> Thank you very much for working through this.
>
> This patch came up for review in Monday patch queue review.
>
> Please drop the "Signed-off-by" and submit the patch again.
>
> We only accept "Signed-off-by" in the cases where the submitter
> is not assigning copyright. In your case the Google blanket
> copyright assignment is in place (I verified) and so you are
> assigning to the FSF.
>
> Thank you for taking this extra step. I appreciate that it is a
> complicated extra step. We are working to keep the two submission
> processes clear (copyright assignment vs. no copyright assignment)
> and easy to follow.
>
> --
> Cheers,
> Carlos.
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] aarch64: Optimize string functions with shrn instruction
@ 2022-06-27 16:12 Danila Kutenin
  2022-07-06  7:42 ` Danila Kutenin
  0 siblings, 1 reply; 12+ messages in thread
From: Danila Kutenin @ 2022-06-27 16:12 UTC (permalink / raw)
  To: libc-alpha; +Cc: Szabolcs.Nagy, Danila Kutenin

We found that string functions were using AND+ADDP
to find the nibble/syndrome mask, but there is an easier
opportunity through `SHRN dst.8b, src.8h, 4` (shift
right every 2 bytes by 4 and narrow to 1 byte), which has
the same latency on all SIMD ARMv8 targets as ADDP. There
are also possible gaps for memcmp, but that's for
another patch.

We see 10-20% savings for small- to mid-size cases (<=128),
which are the primary cases for general workloads.
---
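As a rough sketch of the change in C with NEON intrinsics (helper names are
made up for illustration; the committed code is the hand-written assembly in
the diffs below, and little-endian layout is assumed):

#include <arm_neon.h>
#include <stdint.h>

/* New scheme: a single SHRN turns the 16 comparison bytes (0x00/0xff from
   CMEQ) into a 64-bit mask with one nibble per input byte.  */
static inline uint64_t
nibble_mask_shrn (uint8x16_t cmp)
{
  /* View the comparison as eight 16-bit lanes, shift each right by 4 and
     narrow to a byte: the low nibble of every output byte comes from the
     even input byte, the high nibble from the odd one.  */
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
}

/* Old scheme being replaced: keep alternate nibbles with a repeated 0xf00f
   mask, then merge adjacent bytes with a pairwise add (the AND+ADDP pair).  */
static inline uint64_t
nibble_mask_addp (uint8x16_t cmp)
{
  uint8x16_t repmask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
  uint8x16_t masked = vandq_u8 (cmp, repmask);
  uint8x16_t paired = vpaddq_u8 (masked, masked);   /* 128->64 */
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (paired), 0);
}

Both helpers produce the same 64-bit nibble mask; the SHRN form just drops
the mask constant and one vector instruction.
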
 sysdeps/aarch64/memchr.S    | 25 +++++++++----------------
 sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
 sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
 sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
 sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
 sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
 6 files changed, 59 insertions(+), 102 deletions(-)
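
Continuing the sketch, this is roughly how the syndrome is consumed in the
match loops (again illustrative only; the real routines count zeros with
rbit/clz, handle big-endian separately, and shift the syndrome by
4*(srcin & 15) to drop the bytes before an unaligned start):

#include <arm_neon.h>
#include <stdint.h>

/* Index of the first byte equal to c in a 16-byte chunk, or 16 if there is
   no match.  Little-endian layout assumed; "first_match_index" is a made-up
   name for illustration.  */
static inline unsigned
first_match_index (uint8x16_t data, uint8_t c)
{
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (c));
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
  if (synd == 0)
    return 16;
  /* Four syndrome bits per input byte, hence the divide by four.  */
  return __builtin_ctzll (synd) >> 2;
}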

diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index b060eee97d..2053a977b6 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -41,24 +41,21 @@
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 
 #define vrepchr		v0
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (MEMCHR)
 	PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (MEMCHR)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -111,8 +105,7 @@ L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	add	tmp, srcin, cntin
 	sub	cntrem, tmp, src
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index e0efbad91c..5179320720 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -37,7 +37,6 @@
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 #define end		x8
 #define endm1		x9
 
@@ -45,18 +44,16 @@
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__memrchr)
 	PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (__memrchr)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	neg	shift, end, lsl 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsl	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -109,8 +103,7 @@ L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 
 	add	tmp, src, 15
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index 442726fd49..ee154ab74b 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -33,38 +33,32 @@
 #define src		x2
 #define tmp1		x1
 #define tmp2		x3
-#define tmp2w		w3
 
 #define vrepchr		v0
 #define vdata		v1
 #define qdata		q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask	v4
-#define vend		v5
-#define dend		d5
+#define vend		v4
+#define dend		d4
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strchrnul)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	tmp2w, 0xf00f
-	dup	vrepmask.8h, tmp2w
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	lsl	tmp2, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
 	cbz	tmp1, L(loop)
@@ -83,8 +77,7 @@ L(loop):
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #ifndef __AARCH64EB__
 	rbit	tmp1, tmp1
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index da53170ece..78d27b4aa6 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -40,7 +40,6 @@
 #define len		x4
 #define synd		x4
 #define	tmp		x5
-#define wtmp		w5
 #define shift		x5
 #define data1		x6
 #define dataw1		w6
@@ -50,9 +49,8 @@
 #define dataq		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 #define dataq2		q1
 
 #ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@
 # define IFSTPCPY(X,...)
 #endif
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbnz	synd, L(tail)
 
 	ldr	dataq, [src, 16]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(start_loop)
 
@@ -162,8 +155,7 @@ L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index a2310871c2..3a5d088407 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -34,35 +34,29 @@
 #define src		x1
 #define	synd		x2
 #define tmp		x3
-#define wtmp		w3
 #define shift		x4
 
 #define data		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /* Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRLEN)
 	PTR_ARG (0)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(loop)
@@ -80,8 +74,7 @@ L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	sub	result, src, srcin
 	fmov	synd, dend
 #ifndef __AARCH64EB__
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 0dbecb0ce9..282bddc9aa 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -33,39 +33,33 @@
 #define src		x2
 #define synd		x3
 #define	shift		x4
-#define wtmp		w4
 #define tmp		x4
 #define cntrem		x5
 
 #define qdata		q0
 #define vdata		v0
 #define vhas_chr	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /*
    Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strnlen)
 	PTR_ARG (0)
 	SIZE_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src], 16
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -103,8 +97,7 @@ L(loop32_2):
 	cbz	synd, L(loop32)
 
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	sub	src, src, 16
 	mov	synd, vend.d[0]
 	sub	result, src, srcin
-- 
2.37.0.rc0.161.g10f37bed90-goog


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] aarch64: Optimize string functions with shrn instruction
  2022-06-22  7:29 Danila Kutenin
@ 2022-06-27 16:02 ` Carlos O'Donell
  2022-06-27 16:12   ` Danila Kutenin
  0 siblings, 1 reply; 12+ messages in thread
From: Carlos O'Donell @ 2022-06-27 16:02 UTC (permalink / raw)
  To: danilak, libc-alpha; +Cc: Szabolcs.Nagy

On 6/22/22 03:29, Danila Kutenin via Libc-alpha wrote:
> We found that the string functions were using AND+ADDP
> to compute the nibble/syndrome mask, but there is a simpler
> option: `SHRN dst, src, 4`, which has the same latency as ADDP
> on all SIMD ARMv8 targets. There are also possible gaps in
> memcmp, but that is left for another patch.
> 
> We see 10-20% savings for small and mid-sized cases (<= 128
> bytes), which are the primary cases for general workloads.
> 
> Signed-off-by: Danila Kutenin <danilak@google.com>
Thank you very much for working through this.

This patch came up in Monday's patch queue review.

Please drop the "Signed-off-by" and submit the patch again.

We only accept "Signed-off-by" in the cases where the submitter
is not assigning copyright. In your case the Google blanket
copyright assignment is in place (I verified) and so you are
assigning to the FSF.

Thank you for taking this extra step. I appreciate that it is a
complicated one. We are working to keep the two submission
processes clear (copyright assignment vs. no copyright assignment)
and easy to follow.

-- 
Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] aarch64: Optimize string functions with shrn instruction
@ 2022-06-22  7:29 Danila Kutenin
  2022-06-27 16:02 ` Carlos O'Donell
  0 siblings, 1 reply; 12+ messages in thread
From: Danila Kutenin @ 2022-06-22  7:29 UTC (permalink / raw)
  To: libc-alpha; +Cc: Szabolcs.Nagy, Danila Kutenin

We found that the string functions were using AND+ADDP
to compute the nibble/syndrome mask, but there is a simpler
option: `SHRN dst, src, 4`, which has the same latency as ADDP
on all SIMD ARMv8 targets. There are also possible gaps in
memcmp, but that is left for another patch.

We see 10-20% savings for small and mid-sized cases (<= 128
bytes), which are the primary cases for general workloads.
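
For reference, below is a small self-contained C example of the reduction
written with ACLE NEON intrinsics.  It is illustrative only (not part of
the patch and not the glibc code): the helper names are made up, and it
assumes a little-endian AArch64 target with GCC/Clang and <arm_neon.h>.
It shows that the old AND+ADDP sequence and the new SHRN #4 produce the
same 64-bit nibble mask, four bits per input byte:

/* Illustrative sketch only -- not part of the patch and not the glibc
   sources.  Renders the two reductions as intrinsics and checks that
   they produce the same 64-bit nibble mask.  */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Old reduction: keep one nibble per compare byte (mask 0xf00f per
   halfword), then ADDP folds each pair of bytes into one byte.  */
static uint64_t
syndrome_and_addp (uint8x16_t cmp)
{
  uint8x16_t repmask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
  uint8x16_t masked = vandq_u8 (cmp, repmask);
  uint8x16_t folded = vpaddq_u8 (masked, masked);	/* 128->64 */
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (folded), 0);
}

/* New reduction: view the compare result as eight 16-bit lanes, shift
   each right by 4 and narrow to 8 bits (SHRN #4), which likewise leaves
   4 bits per compare byte, in string order.  */
static uint64_t
syndrome_shrn (uint8x16_t cmp)
{
  uint8x8_t narrowed = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0);
}

int
main (void)
{
  const char buf[17] = "abcdeXghijklmnop";	/* 'X' is at index 5.  */
  uint8x16_t data = vld1q_u8 ((const uint8_t *) buf);
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 ('X'));	/* 0xff where equal.  */

  uint64_t s1 = syndrome_and_addp (cmp);
  uint64_t s2 = syndrome_shrn (cmp);

  /* Both are 0x0000000000f00000 here: nibble 5 is set, so counting the
     trailing zeros and dividing by 4 recovers the byte index.  */
  printf ("and+addp: 0x%016llx index %d\n",
	  (unsigned long long) s1, __builtin_ctzll (s1) / 4);
  printf ("shrn #4 : 0x%016llx index %d\n",
	  (unsigned long long) s2, __builtin_ctzll (s2) / 4);
  return 0;
}

In the patched assembly the mask is moved out with fmov, and on
little-endian an rbit plus a zero count plays the role of the
trailing-zero count used in the sketch.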

Signed-off-by: Danila Kutenin <danilak@google.com>
---
 sysdeps/aarch64/memchr.S    | 25 +++++++++----------------
 sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
 sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
 sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
 sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
 sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
 6 files changed, 59 insertions(+), 102 deletions(-)

diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index b060eee97d..2053a977b6 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -41,24 +41,21 @@
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 
 #define vrepchr		v0
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (MEMCHR)
 	PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (MEMCHR)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -111,8 +105,7 @@ L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	add	tmp, srcin, cntin
 	sub	cntrem, tmp, src
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index e0efbad91c..5179320720 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -37,7 +37,6 @@
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 #define end		x8
 #define endm1		x9
 
@@ -45,18 +44,16 @@
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__memrchr)
 	PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (__memrchr)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	neg	shift, end, lsl 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsl	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -109,8 +103,7 @@ L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 
 	add	tmp, src, 15
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index 442726fd49..ee154ab74b 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -33,38 +33,32 @@
 #define src		x2
 #define tmp1		x1
 #define tmp2		x3
-#define tmp2w		w3
 
 #define vrepchr		v0
 #define vdata		v1
 #define qdata		q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask	v4
-#define vend		v5
-#define dend		d5
+#define vend		v4
+#define dend		d4
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strchrnul)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	tmp2w, 0xf00f
-	dup	vrepmask.8h, tmp2w
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	lsl	tmp2, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
 	cbz	tmp1, L(loop)
@@ -83,8 +77,7 @@ L(loop):
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #ifndef __AARCH64EB__
 	rbit	tmp1, tmp1
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index da53170ece..78d27b4aa6 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -40,7 +40,6 @@
 #define len		x4
 #define synd		x4
 #define	tmp		x5
-#define wtmp		w5
 #define shift		x5
 #define data1		x6
 #define dataw1		w6
@@ -50,9 +49,8 @@
 #define dataq		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 #define dataq2		q1
 
 #ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@
 # define IFSTPCPY(X,...)
 #endif
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbnz	synd, L(tail)
 
 	ldr	dataq, [src, 16]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(start_loop)
 
@@ -162,8 +155,7 @@ L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index a2310871c2..3a5d088407 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -34,35 +34,29 @@
 #define src		x1
 #define	synd		x2
 #define tmp		x3
-#define wtmp		w3
 #define shift		x4
 
 #define data		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /* Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRLEN)
 	PTR_ARG (0)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(loop)
@@ -80,8 +74,7 @@ L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	sub	result, src, srcin
 	fmov	synd, dend
 #ifndef __AARCH64EB__
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 0dbecb0ce9..282bddc9aa 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -33,39 +33,33 @@
 #define src		x2
 #define synd		x3
 #define	shift		x4
-#define wtmp		w4
 #define tmp		x4
 #define cntrem		x5
 
 #define qdata		q0
 #define vdata		v0
 #define vhas_chr	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /*
    Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strnlen)
 	PTR_ARG (0)
 	SIZE_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src], 16
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -103,8 +97,7 @@ L(loop32_2):
 	cbz	synd, L(loop32)
 
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	sub	src, src, 16
 	mov	synd, vend.d[0]
 	sub	result, src, srcin
-- 
2.37.0.rc0.104.g0611611a94-goog


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2022-07-06  8:30 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-20 17:46 [PATCH] aarch64: Optimize string functions with shrn instruction Danila Kutenin
2022-06-21  9:07 ` Szabolcs Nagy
2022-06-21  9:28   ` Danila Kutenin
2022-06-22  6:48     ` Szabolcs Nagy
2022-06-22  7:31       ` Danila Kutenin
2022-06-22  8:40         ` Szabolcs Nagy
2022-06-22  7:29 Danila Kutenin
2022-06-27 16:02 ` Carlos O'Donell
2022-06-27 16:12   ` Danila Kutenin
2022-06-27 16:12 Danila Kutenin
2022-07-06  7:42 ` Danila Kutenin
2022-07-06  8:30   ` Szabolcs Nagy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).