From mboxrd@z Thu Jan 1 00:00:00 1970
From: Sunil K Pandey <skpgkp2@gmail.com>
To: libc-alpha@sourceware.org
Subject: [PATCH] x86-64: Improve evex512 version of strlen functions
Date: Fri, 28 Oct 2022 08:48:10 -0700
Message-Id: <20221028154810.1801123-1-skpgkp2@gmail.com>
X-Mailer: git-send-email 2.36.1
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

This patch improves the following functionality:
- Replace VPCMP with VPCMPEQ.
- Replace page cross check logic with sall.
- Remove extra lea from align_more.
- Remove unconditional loop jump.
- Use bsf to check max length in the first vector.
---
Illustrative C sketches of the page cross check, the strnlen
first-vector path, and the align_more counter setup follow the
patch; they are not part of the change.

 sysdeps/x86_64/multiarch/strlen-evex-base.S | 91 +++++++++++++--------
 1 file changed, 57 insertions(+), 34 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index c832b15a48..fd6c770e6e 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -25,12 +25,12 @@
 # include <sysdep.h>
 
 # ifdef USE_AS_WCSLEN
-#  define VPCMP		vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPTESTN	vptestnmd
 #  define VPMINU	vpminud
 #  define CHAR_SIZE	4
 # else
-#  define VPCMP		vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPTESTN	vptestnmb
 #  define VPMINU	vpminub
 #  define CHAR_SIZE	1
@@ -55,20 +55,29 @@ ENTRY_P2ALIGN (STRLEN, 6)
 
 	movl	%edi, %eax
 	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM(0), %k0
+	VPCMPEQ	(%rdi), %VMM(0), %k0
+# ifdef USE_AS_STRNLEN
+	KMOV	%k0, %VRCX
+	/* Store max length in rax.  */
+	mov	%rsi, %rax
+	/* If rcx is 0, rax will have max length.  We can not use VRCX
+	   and VRAX here for evex256 because, upper 32 bits may be
+	   undefined for ecx and eax.  */
+	bsfq	%rcx, %rax
+	cmp	$CHAR_PER_VEC, %rax
+	ja	L(align_more)
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+# else
 	KMOV	%k0, %VRAX
 	test	%VRAX, %VRAX
 	jz	L(align_more)
-
 	bsf	%VRAX, %VRAX
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
 # endif
 	ret
 
@@ -81,25 +90,24 @@ L(ret_max):
 # endif
 
 L(align_more):
-	leaq	VEC_SIZE(%rdi), %rax
+	mov	%rdi, %rax
 	/* Align rax to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rax
 # ifdef USE_AS_STRNLEN
-	movq	%rax, %rdx
-	subq	%rdi, %rdx
+	movq	%rdi, %rdx
+	subq	%rax, %rdx
 #  ifdef USE_AS_WCSLEN
 	shr	$2, %VRDX
 #  endif
 	/* At this point rdx contains [w]chars already compared.  */
-	subq	%rsi, %rdx
-	jae	L(ret_max)
-	negq	%rdx
+	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
 	/* At this point rdx contains number of w[char] needs to go.
 	   Now onwards rdx will keep decrementing with each compare.  */
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM(0), %k0
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
+	subq	$-VEC_SIZE, %rax
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x1)
@@ -109,7 +117,7 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM(0), %k0
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x2)
@@ -119,7 +127,7 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x3)
@@ -129,7 +137,7 @@ L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
 	KMOV	%k0, %VRCX
 	test	%VRCX, %VRCX
 	jnz	L(ret_vec_x4)
@@ -155,16 +163,10 @@ L(align_more):
 	addq	%rcx, %rdx
 	/* Need jump as we don't want to add/subtract rdx for first
 	   iteration of 4 x VEC_SIZE aligned loop.  */
-	jmp	L(loop_entry)
 # endif
 
 	.p2align 4,,11
 L(loop):
-# ifdef USE_AS_STRNLEN
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	jbe	L(ret_max)
-L(loop_entry):
-# endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
 	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
@@ -177,7 +179,18 @@ L(loop_entry):
 	subq	$-(VEC_SIZE * 4), %rax
 
 	KORTEST	%k0, %k1
-	jz	L(loop)
+
+# ifndef USE_AS_STRNLEN
+	jz	L(loop)
+# else
+	jnz	L(loopend)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop)
+	mov	%rsi, %rax
+	ret
+# endif
+
+L(loopend):
 
 	VPTESTN	%VMM(1), %VMM(1), %k2
 	KMOV	%k2, %VRCX
@@ -249,24 +262,34 @@ L(ret_vec_x1):
 	ret
 
 L(page_cross):
-	movl	%eax, %ecx
-# ifdef USE_AS_WCSLEN
+	mov	%rdi, %rax
+	movl	%edi, %ecx
 	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
 	sarl	$2, %ecx
 # endif
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
-	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRAX
+	andq	$-VEC_SIZE, %rax
+	VPCMPEQ	(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRDX
 	/* Ignore number of character for alignment adjustment.  */
-	shr	%cl, %VRAX
+	shr	%cl, %VRDX
+# ifdef USE_AS_STRNLEN
+	jnz	L(page_cross_end)
+	movl	$CHAR_PER_VEC, %eax
+	sub	%ecx, %eax
+	cmp	%rax, %rsi
+	ja	L(align_more)
+# else
 	jz	L(align_more)
+# endif
 
-	bsf	%VRAX, %VRAX
+L(page_cross_end):
+	bsf	%VRDX, %VRAX
 # ifdef USE_AS_STRNLEN
 	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
+	cmovnb	%esi, %eax
 # endif
 	ret
 
-- 
2.36.1
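
For reference only, not part of the patch: a minimal C sketch of why
the sall based page cross check matches the old and/cmp pair, assuming
4 KiB pages and the 64-byte evex512 vector size.  The function names
are invented for illustration.  Both forms only look at the 12
page-offset bits of the address, so walking two pages worth of offsets
covers every case.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096	/* assumed 4 KiB pages */
#define VEC_SIZE  64	/* evex512 vector width in bytes */

/* Old check: andl $(PAGE_SIZE - 1), %eax;
   cmpl $(PAGE_SIZE - VEC_SIZE), %eax; ja L(page_cross).  */
static int
page_cross_old (uint32_t addr)
{
  return (addr & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* New check: sall $20, %eax;
   cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax; ja L(page_cross).
   Shifting left by 20 moves the 12 page-offset bits to the top of the
   32-bit register and discards everything above them, so the unsigned
   compare sees the same ordering as the masked compare above.  */
static int
page_cross_new (uint32_t addr)
{
  return (uint32_t) (addr << 20) > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
}

int
main (void)
{
  for (uint32_t a = 0; a < 2 * PAGE_SIZE; a++)
    assert (page_cross_old (a) == page_cross_new (a));
  puts ("page cross checks agree");
  return 0;
}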
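
Likewise, a rough C model of the new strnlen first-vector path for the
byte variant (CHAR_PER_VEC == 64 assumed, helper name invented).  The
asm relies on bsf leaving its destination unchanged when the source is
zero, which the model writes out as an explicit branch;
__builtin_ctzll stands in for bsf/tzcnt.

#include <assert.h>
#include <stdint.h>

#define CHAR_PER_VEC 64	/* byte variant of the evex512 build */

/* Returns the strnlen result when it can be decided from the first
   vector, or -1 when the code would fall through to L(align_more).
   'mask' has bit i set when byte i of the first vector is NUL.  */
static int64_t
first_vec_strnlen (uint64_t mask, uint64_t maxlen)
{
  uint64_t len = maxlen;			/* mov  %rsi, %rax	*/
  if (mask != 0)				/* bsfq %rcx, %rax	*/
    len = (uint64_t) __builtin_ctzll (mask);	/*  (dest kept if 0)	*/
  if (len > CHAR_PER_VEC)			/* cmp  $CHAR_PER_VEC	*/
    return -1;					/* ja   L(align_more)	*/
  if (maxlen < len)				/* cmpq %rax, %rsi	*/
    len = maxlen;				/* cmovb %esi, %eax	*/
  return (int64_t) len;
}

int
main (void)
{
  assert (first_vec_strnlen (1ull << 5, 100) == 5);	/* NUL at 5	*/
  assert (first_vec_strnlen (1ull << 5, 3) == 3);	/* clamped	*/
  assert (first_vec_strnlen (0, 10) == 10);		/* short maxlen	*/
  assert (first_vec_strnlen (0, 100) == -1);		/* keep scanning */
  return 0;
}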
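
And a small sketch of why the align_more prologue without the extra
lea still leaves rdx with the same "characters left to scan" value for
the byte variant (CHAR_PER_VEC == VEC_SIZE).  The old early
jae L(ret_max) exit corresponds to a non-positive count, which the new
code instead catches in the per-vector length checks.

#include <assert.h>
#include <stdint.h>

#define VEC_SIZE     64
#define CHAR_PER_VEC 64	/* byte variant: one char per byte */

/* Old prologue: leaq VEC_SIZE(%rdi), %rax; andq $-VEC_SIZE, %rax;
   rdx = rax - rdi (chars already compared); rdx = rsi - rdx.  */
static int64_t
remaining_old (uint64_t rdi, uint64_t rsi)
{
  uint64_t rax = (rdi + VEC_SIZE) & -(uint64_t) VEC_SIZE;
  uint64_t done = rax - rdi;
  return (int64_t) (rsi - done);
}

/* New prologue: mov %rdi, %rax; andq $-VEC_SIZE, %rax;
   rdx = rdi - rax; rdx = rsi + rdx - CHAR_PER_VEC.  The first compare
   reads from VEC_SIZE(%rax), the same address the old rounded-up rax
   pointed to, so the scan itself is unchanged.  */
static int64_t
remaining_new (uint64_t rdi, uint64_t rsi)
{
  uint64_t rax = rdi & -(uint64_t) VEC_SIZE;
  uint64_t misalign = rdi - rax;
  return (int64_t) (rsi + misalign - CHAR_PER_VEC);
}

int
main (void)
{
  for (uint64_t rdi = 0x1000; rdi < 0x1000 + 2 * VEC_SIZE; rdi++)
    for (uint64_t rsi = 1; rsi < 4 * VEC_SIZE; rsi++)
      assert (remaining_old (rdi, rsi) == remaining_new (rdi, rsi));
  return 0;
}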