From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <skpgkp2@sourceware.org>
Received: by sourceware.org (Postfix, from userid 7852)
 id 339C13857BA1; Tue, 19 Jul 2022 05:53:56 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 339C13857BA1
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Sunil Pandey <skpgkp2@sourceware.org>
To: glibc-cvs@sourceware.org
Subject: [glibc/release/2.33/master] x86: Shrink code size of memchr-evex.S
X-Act-Checkin: glibc
X-Git-Author: Noah Goldstein <goldstein.w.n@gmail.com>
X-Git-Refname: refs/heads/release/2.33/master
X-Git-Oldrev: 992086b659a897e9019e97d82d3589adba317e6e
X-Git-Newrev: 289f772721e088f4b3f9204903bc6cba7e860849
Message-Id: <20220719055356.339C13857BA1@sourceware.org>
Date: Tue, 19 Jul 2022 05:53:56 +0000 (GMT)
X-BeenThere: glibc-cvs@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Glibc-cvs mailing list <glibc-cvs.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/glibc-cvs>,
 <mailto:glibc-cvs-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/glibc-cvs/>
List-Help: <mailto:glibc-cvs-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/glibc-cvs>,
 <mailto:glibc-cvs-request@sourceware.org?subject=subscribe>
X-List-Received-Date: Tue, 19 Jul 2022 05:53:56 -0000

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=289f772721e088f4b3f9204903bc6cba7e860849

commit 289f772721e088f4b3f9204903bc6cba7e860849
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:34 2022 -0700

    x86: Shrink code size of memchr-evex.S
    
    This is not meant as a performance optimization. The previous code was
    far to liberal in aligning targets and wasted code size unnecissarily.
    
    The total code size saving is: 64 bytes
    
    There are no non-negligible changes in the benchmarks.
    Geometric Mean of all benchmarks New / Old: 1.000
    
    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    
    (cherry picked from commit 56da3fe1dd075285fa8186d44b3c28e68c687e62)

Diff:
---
 sysdeps/x86_64/multiarch/memchr-evex.S | 46 ++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 4d0ed6d136..68381c99a4 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -88,7 +88,7 @@
 # define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 6)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
@@ -131,22 +131,24 @@ L(zero):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 5
+	.p2align 4
 L(first_vec_x0):
-	/* Check if first match was before length.  */
-	tzcntl	%eax, %eax
-	xorl	%ecx, %ecx
-	cmpl	%eax, %edx
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-	cmovle	%rcx, %rax
+	/* Check if first match was before length. NB: tzcnt has false data-
+	   dependency on destination. eax already had a data-dependency on esi
+	   so this should have no affect here.  */
+	tzcntl	%eax, %esi
+#  ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
+#  else
+	addq	%rsi, %rdi
+#  endif
+	xorl	%eax, %eax
+	cmpl	%esi, %edx
+	cmovg	%rdi, %rax
 	ret
-# else
-	/* NB: first_vec_x0 is 17 bytes which will leave
-	   cross_page_boundary (which is relatively cold) close enough
-	   to ideal alignment. So only realign L(cross_page_boundary) if
-	   rawmemchr.  */
-	.p2align 4
 # endif
+
+	.p2align 4
 L(cross_page_boundary):
 	/* Save pointer before aligning as its original value is
 	   necessary for computer return address if byte is found or
@@ -400,10 +402,14 @@ L(last_2x_vec):
 L(zero_end):
 	ret
 
+L(set_zero_end):
+	xorl	%eax, %eax
+	ret
 
 	.p2align 4
 L(first_vec_x1_check):
-	tzcntl	%eax, %eax
+	/* eax must be non-zero. Use bsfl to save code size.  */
+	bsfl	%eax, %eax
 	/* Adjust length.  */
 	subl	$-(CHAR_PER_VEC * 4), %edx
 	/* Check if match within remaining length.  */
@@ -412,9 +418,6 @@ L(first_vec_x1_check):
 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 	ret
-L(set_zero_end):
-	xorl	%eax, %eax
-	ret
 
 	.p2align 4
 L(loop_4x_vec_end):
@@ -464,7 +467,7 @@ L(loop_4x_vec_end):
 # endif
 	ret
 
-	.p2align 4
+	.p2align 4,, 10
 L(last_vec_x1_return):
 	tzcntl	%eax, %eax
 # if defined USE_AS_WMEMCHR || RET_OFFSET != 0
@@ -496,6 +499,7 @@ L(last_vec_x3_return):
 # endif
 
 # ifndef USE_AS_RAWMEMCHR
+	.p2align 4,, 5
 L(last_4x_vec_or_less_cmpeq):
 	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
 	kmovd	%k0, %eax
@@ -546,7 +550,7 @@ L(last_4x_vec):
 #  endif
 	andl	%ecx, %eax
 	jz	L(zero_end2)
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 L(zero_end2):
 	ret
@@ -562,6 +566,6 @@ L(last_vec_x3):
 	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 # endif
-
+	/* 7 bytes from next cache line.  */
 END (MEMCHR)
 #endif