From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <wilco@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1895)
	id EB1383858426; Wed, 10 Apr 2024 14:10:37 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org EB1383858426
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1712758237;
	bh=E6ADOFoCfCd5A/6WBwQJDaKK0mxkqqgWB/2VfKT+6Sc=;
	h=From:To:Subject:Date:From;
	b=cXMePLhiESAYdR5ZFkElhU4cFljid1IDyh1Xl2qU/m7Ic2trxZhkG0Vyt0D0VAIcN
	 uPIq5YgT+AFvMi/ie5t3r8Y5AQzSyrx2CG2tffMNzDeEtyas8Oyse4OP+w7QT9CYUO
	 qOk940aLsKPOuDxuuUfHbJL+aI6NHIKwh/TjCx58=
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Wilco Dijkstra <wilco@sourceware.org>
To: glibc-cvs@sourceware.org
Subject: [glibc/release/2.36/master] AArch64: Improve strrchr
X-Act-Checkin: glibc
X-Git-Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
X-Git-Refname: refs/heads/release/2.36/master
X-Git-Oldrev: 7cbcc959270dcf25789b2f09c4bf1c60a86b760a
X-Git-Newrev: 600098c58ab53107a76237cba8b90ce26b253b56
Message-Id: <20240410141037.EB1383858426@sourceware.org>
Date: Wed, 10 Apr 2024 14:10:37 +0000 (GMT)
List-Id: <glibc-cvs.sourceware.org>

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=600098c58ab53107a76237cba8b90ce26b253b56

commit 600098c58ab53107a76237cba8b90ce26b253b56
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date:   Wed Jan 11 13:53:19 2023 +0000

    AArch64: Improve strrchr
    
    Use shrn for narrowing the mask, which simplifies code and speeds up small
    strings.  Unroll the first search loop to improve performance on large
    strings.
    
    Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
    (cherry picked from commit 55599d480437dcf129b41b95be32b48f2a9e5da9)

Diff:
---
 sysdeps/aarch64/strrchr.S | 58 +++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
index 596e77c43b..eda6fefb99 100644
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@@ -22,19 +22,16 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
  * MTE compatible.
  */
 
-/* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
 #define result		x0
 
 #define src		x2
 #define tmp		x3
-#define wtmp		w3
 #define synd		x3
 #define shift		x4
 #define src_match	x4
@@ -46,7 +43,6 @@
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
 #define vend		v5
 #define dend		d5
 
@@ -58,59 +54,71 @@
    the relevant byte matched the requested character; bits 2-3 are set
    if the relevant byte matched the NUL end of string.  */
 
-ENTRY(strrchr)
+ENTRY (strrchr)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0x3003
-	dup	vrepmask.8h, wtmp
-	tst	srcin, 15
-	beq	L(loop1)
-
-	ld1	{vdata.16b}, [src], 16
+	movi	vrepmask.16b, 0x33
+	ld1	{vdata.16b}, [src]
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp, 0xf00f
-	dup	vrepmask2.8h, wtmp
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	lsl	shift, srcin, 2
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	lsl	synd, synd, shift
 	ands	nul_match, synd, 0xcccccccccccccccc
 	bne	L(tail)
-	cbnz	synd, L(loop2)
+	cbnz	synd, L(loop2_start)
 
-	.p2align 5
+	.p2align 4
 L(loop1):
-	ld1	{vdata.16b}, [src], 16
+	ldr	q1, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loop1_end)
+	ldr	q1, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop1)
-
+	sub	src, src, 16
+L(loop1_end):
+	add	src, src, 16
 	cmeq	vhas_nul.16b, vdata.16b, 0
+#ifdef __AARCH64EB__
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	rbit	synd, synd
+#else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	bic	vhas_nul.8h, 0x0f, lsl 8
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	fmov	synd, dend
+#endif
 	ands	nul_match, synd, 0xcccccccccccccccc
-	beq	L(loop2)
-
+	beq	L(loop2_start)
 L(tail):
 	sub	nul_match, nul_match, 1
 	and	chr_match, synd, 0x3333333333333333
 	ands	chr_match, chr_match, nul_match
-	sub	result, src, 1
+	add	result, src, 15
 	clz	tmp, chr_match
 	sub	result, result, tmp, lsr 2
 	csel	result, result, xzr, ne
 	ret
 
 	.p2align 4
+	nop
+	nop
+L(loop2_start):
+	add	src, src, 16
+	bic	vrepmask.8h, 0xf0
+
 L(loop2):
 	cmp	synd, 0
 	csel	src_match, src, src_match, ne