From: Sunil Pandey
To: glibc-cvs@sourceware.org
Subject: [glibc] x86-64: Improve evex512 version of strlen functions
X-Act-Checkin: glibc
X-Git-Author: Sunil K Pandey
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 361d6454c034a920f2c96517c277990d390b9652
X-Git-Newrev: e96971482de05eff92c1408b694c320cedd2d167
Message-Id: <20221030211404.067D73858D38@sourceware.org>
Date: Sun, 30 Oct 2022 21:14:04 +0000 (GMT)

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=e96971482de05eff92c1408b694c320cedd2d167

commit e96971482de05eff92c1408b694c320cedd2d167
Author: Sunil K Pandey
Date:   Mon Oct 3 12:00:53 2022 -0700

    x86-64: Improve evex512 version of strlen functions

    This patch improves the following functionality:
    - Replace VPCMP with VPCMPEQ.
    - Replace the page-cross check logic with sall.
    - Remove the extra lea from align_more.
    - Remove the unconditional loop jump.
    - Use bsf to check the max length in the first vector.

    Reviewed-by: Noah Goldstein

Diff:
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 91 ++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 34 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index c832b15a48..fd6c770e6e 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -25,12 +25,12 @@
 # include
 
 # ifdef USE_AS_WCSLEN
-# define VPCMP          vpcmpd
+# define VPCMPEQ        vpcmpeqd
 # define VPTESTN        vptestnmd
 # define VPMINU         vpminud
 # define CHAR_SIZE      4
 # else
-# define VPCMP          vpcmpb
+# define VPCMPEQ        vpcmpeqb
 # define VPTESTN        vptestnmb
 # define VPMINU         vpminub
 # define CHAR_SIZE      1
@@ -55,20 +55,29 @@
 ENTRY_P2ALIGN (STRLEN, 6)
         movl    %edi, %eax
         vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
-        andl    $(PAGE_SIZE - 1), %eax
-        cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
+        sall    $20, %eax
+        cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
         ja      L(page_cross)
 
         /* Compare [w]char for null, mask bit will be set for match.  */
-        VPCMP   $0, (%rdi), %VMM(0), %k0
+        VPCMPEQ (%rdi), %VMM(0), %k0
+# ifdef USE_AS_STRNLEN
+        KMOV    %k0, %VRCX
+        /* Store max length in rax.  */
+        mov     %rsi, %rax
+        /* If rcx is 0, rax will have max length.  We can not use VRCX
+           and VRAX here for evex256 because, upper 32 bits may be
+           undefined for ecx and eax.  */
+        bsfq    %rcx, %rax
+        cmp     $CHAR_PER_VEC, %rax
+        ja      L(align_more)
+        cmpq    %rax, %rsi
+        cmovb   %esi, %eax
+# else
         KMOV    %k0, %VRAX
         test    %VRAX, %VRAX
         jz      L(align_more)
-
         bsf     %VRAX, %VRAX
-# ifdef USE_AS_STRNLEN
-        cmpq    %rsi, %rax
-        cmovnb  %rsi, %rax
 # endif
         ret
 
@@ -81,25 +90,24 @@ L(ret_max):
 # endif
 
 L(align_more):
-        leaq    VEC_SIZE(%rdi), %rax
+        mov     %rdi, %rax
         /* Align rax to VEC_SIZE.  */
         andq    $-VEC_SIZE, %rax
 # ifdef USE_AS_STRNLEN
-        movq    %rax, %rdx
-        subq    %rdi, %rdx
+        movq    %rdi, %rdx
+        subq    %rax, %rdx
 # ifdef USE_AS_WCSLEN
         shr     $2, %VRDX
 # endif
         /* At this point rdx contains [w]chars already compared.  */
-        subq    %rsi, %rdx
-        jae     L(ret_max)
-        negq    %rdx
+        leaq    -CHAR_PER_VEC(%rsi, %rdx), %rdx
         /* At this point rdx contains number of w[char] needs to go.
            Now onwards rdx will keep decrementing with each compare.  */
 # endif
 
         /* Loop unroll 4 times for 4 vector loop.  */
-        VPCMP   $0, (%rax), %VMM(0), %k0
+        VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
+        subq    $-VEC_SIZE, %rax
         KMOV    %k0, %VRCX
         test    %VRCX, %VRCX
         jnz     L(ret_vec_x1)
@@ -109,7 +117,7 @@ L(align_more):
         jbe     L(ret_max)
 # endif
 
-        VPCMP   $0, VEC_SIZE(%rax), %VMM(0), %k0
+        VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
         KMOV    %k0, %VRCX
         test    %VRCX, %VRCX
         jnz     L(ret_vec_x2)
@@ -119,7 +127,7 @@ L(align_more):
         jbe     L(ret_max)
 # endif
 
-        VPCMP   $0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+        VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
         KMOV    %k0, %VRCX
         test    %VRCX, %VRCX
         jnz     L(ret_vec_x3)
@@ -129,7 +137,7 @@ L(align_more):
         jbe     L(ret_max)
 # endif
 
-        VPCMP   $0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+        VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
         KMOV    %k0, %VRCX
         test    %VRCX, %VRCX
         jnz     L(ret_vec_x4)
@@ -155,16 +163,10 @@ L(align_more):
         addq    %rcx, %rdx
         /* Need jump as we don't want to add/subtract rdx for first
            iteration of 4 x VEC_SIZE aligned loop.  */
-        jmp     L(loop_entry)
 # endif
 
         .p2align 4,,11
 L(loop):
-# ifdef USE_AS_STRNLEN
-        subq    $(CHAR_PER_VEC * 4), %rdx
-        jbe     L(ret_max)
-L(loop_entry):
-# endif
         /* VPMINU and VPCMP combination provide better performance as
            compared to alternative combinations.  */
         VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
@@ -177,7 +179,18 @@ L(loop_entry):
 
         subq    $-(VEC_SIZE * 4), %rax
         KORTEST %k0, %k1
-        jz      L(loop)
+
+# ifndef USE_AS_STRNLEN
+        jz      L(loop)
+# else
+        jnz     L(loopend)
+        subq    $(CHAR_PER_VEC * 4), %rdx
+        ja      L(loop)
+        mov     %rsi, %rax
+        ret
+# endif
+
+L(loopend):
         VPTESTN %VMM(1), %VMM(1), %k2
         KMOV    %k2, %VRCX
 
@@ -249,24 +262,34 @@ L(ret_vec_x1):
         ret
 
 L(page_cross):
-        movl    %eax, %ecx
-# ifdef USE_AS_WCSLEN
+        mov     %rdi, %rax
+        movl    %edi, %ecx
         andl    $(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
         sarl    $2, %ecx
 # endif
         /* ecx contains number of w[char] to be skipped as a result
            of address alignment.  */
-        xorq    %rdi, %rax
-        VPCMP   $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
-        KMOV    %k0, %VRAX
+        andq    $-VEC_SIZE, %rax
+        VPCMPEQ (%rax), %VMM(0), %k0
+        KMOV    %k0, %VRDX
         /* Ignore number of character for alignment adjustment.  */
-        shr     %cl, %VRAX
+        shr     %cl, %VRDX
+# ifdef USE_AS_STRNLEN
+        jnz     L(page_cross_end)
+        movl    $CHAR_PER_VEC, %eax
+        sub     %ecx, %eax
+        cmp     %rax, %rsi
+        ja      L(align_more)
+# else
         jz      L(align_more)
+# endif
 
-        bsf     %VRAX, %VRAX
+L(page_cross_end):
+        bsf     %VRDX, %VRAX
 # ifdef USE_AS_STRNLEN
         cmpq    %rsi, %rax
-        cmovnb  %rsi, %rax
+        cmovnb  %esi, %eax
 # endif
         ret
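
For readers following the commit message rather than the assembly, the code below is a
minimal C sketch (not glibc code, and not part of this commit) of the first-vector
technique the patch tunes: compare one full 64-byte vector against zero with a single
VPCMPEQB-style compare, take the bit-scan ("bsf") of the resulting mask as the length,
and guard the unaligned load with the page-cross check that the patch now performs with
"sall $20".  The function name strlen_evex512_sketch, the single-vector fallback loop,
and the build line are illustrative assumptions; the sketch covers only the plain
strlen/char case, not wcslen or strnlen, and assumes GCC or Clang on an AVX512BW-capable
CPU (build with: gcc -O2 -mavx512bw sketch.c).

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define VEC_SIZE  64
#define PAGE_SIZE 4096

/* Illustrative sketch only: like the real implementation it reads whole
   vectors and may read past the terminating null within the same page,
   which never faults on x86 but is not strictly conforming ISO C.  */
static size_t
strlen_evex512_sketch (const char *s)
{
  const __m512i zero = _mm512_setzero_si512 ();
  __mmask64 m;

  /* First-vector fast path: legal only if the unaligned 64-byte load
     cannot cross into the next page.  The patch folds this test into
     "sall $20; cmpl $((PAGE_SIZE - VEC_SIZE) << 20)", letting the shift
     discard the bits above the page offset; plain C masks and compares.  */
  if (((uintptr_t) s & (PAGE_SIZE - 1)) <= PAGE_SIZE - VEC_SIZE)
    {
      /* VPCMPEQB of the vector against zero: one mask bit per byte.  */
      m = _mm512_cmpeq_epi8_mask (_mm512_loadu_si512 (s), zero);
      if (m != 0)
        return (size_t) __builtin_ctzll (m);   /* "bsf" of the match mask.  */
    }
  else
    {
      /* Page-cross path: an aligned 64-byte load never crosses a page, so
         load the aligned vector containing s and shift out the mask bits
         that belong to bytes before s (the "shr %cl" step in the patch).  */
      const char *a = (const char *) ((uintptr_t) s & ~(uintptr_t) (VEC_SIZE - 1));
      m = _mm512_cmpeq_epi8_mask (_mm512_load_si512 (a), zero);
      m >>= (uintptr_t) (s - a);
      if (m != 0)
        return (size_t) __builtin_ctzll (m);
    }

  /* No null in the first vector: continue from the next aligned vector.
     The real code unrolls this 4x and uses VPMINU/VPTESTN/KORTEST; one
     vector per iteration keeps the sketch short.  */
  const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) (VEC_SIZE - 1));
  for (;;)
    {
      p += VEC_SIZE;
      m = _mm512_cmpeq_epi8_mask (_mm512_load_si512 (p), zero);
      if (m != 0)
        return (size_t) (p - s) + (size_t) __builtin_ctzll (m);
    }
}

int
main (void)
{
  const char *samples[] = {
    "", "x", "hello, evex512",
    "a longer test string so the aligned loop path gets exercised as well",
  };
  for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i++)
    printf ("%zu == %zu\n", strlen_evex512_sketch (samples[i]),
            strlen (samples[i]));
  return 0;
}

The strnlen variant additionally clamps the result against the caller's maximum length,
which is what the new bsfq/cmp/cmovb sequence in the first hunk implements; that
bookkeeping is omitted here to keep the sketch focused on the vector compare and bit-scan.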