public inbox for glibc-cvs@sourceware.org
* [glibc] x86: Optimize memrchr-evex.S
@ 2022-10-20  1:45 Noah Goldstein
  0 siblings, 0 replies; 2+ messages in thread
From: Noah Goldstein @ 2022-10-20  1:45 UTC (permalink / raw)
  To: glibc-cvs

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=4af6844aa5d3577e327f15dd877a38a043cb236a

commit 4af6844aa5d3577e327f15dd877a38a043cb236a
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Tue Oct 18 17:44:06 2022 -0700

    x86: Optimize memrchr-evex.S
    
    Optimizations are:
    1. Use the fact that lzcnt(0) -> VEC_SIZE for memrchr to save a branch
       in the short-string case (see the sketch after this list).
    2. Save several instructions in the len = [VEC_SIZE, 4 * VEC_SIZE] case.
    3. Use more code-size efficient instructions.
            - tzcnt ...     -> bsf ...
            - vpcmpb $0 ... -> vpcmpeq ...
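
    A minimal C sketch (editorial illustration, not part of the commit) of
    the lzcnt property behind item 1: because lzcnt of a zero mask is the
    operand width (VEC_SIZE here), one length compare covers both "no
    match in the VEC" and "match out of bounds" on the short-string path.
    The helper name and intrinsic-based shape are assumptions for clarity.

        /* Compile with LZCNT support (e.g. gcc -mlzcnt).  */
        #include <immintrin.h>
        #include <stddef.h>

        #define VEC_SIZE 32

        /* mask: one bit per byte of the final 32-byte vector, set on match.
           len:  remaining length in bytes (len <= VEC_SIZE on this path).
           end:  pointer to the last byte of the buffer (the endptr - 1 form
                 the assembly keeps in rax).  */
        static inline void *
        last_match_in_vec (unsigned int mask, size_t len, char *end)
        {
          unsigned int dist = _lzcnt_u32 (mask); /* 32 when mask == 0.  */
          if (len <= dist)       /* No match, or match before the buffer.  */
            return NULL;
          return end - dist;     /* Mirrors the single subq %rsi, %rax.  */
        }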
    
    Code Size Changes:
    memrchr-evex.S      :  -29 bytes
    
    Net perf changes:
    
    Reported as the geometric mean of all improvements / regressions from
    N=10 runs of the benchtests. Values are New Time / Old Time, so < 1.0
    is an improvement and > 1.0 is a regression.
    
    memrchr-evex.S      : 0.949 (Mostly from improvements in small strings)
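
    For reference, a small C sketch (editorial illustration only; the
    benchtest harness is not shown and the function name is invented) of
    how a New Time / Old Time geometric mean such as the 0.949 above is
    formed from per-benchmark timings:

        #include <math.h>
        #include <stddef.h>

        /* Geometric mean of new_t[i] / old_t[i] over n > 0 benchmarks;
           < 1.0 means the new code is faster on average.  */
        static double
        geomean_ratio (const double *new_t, const double *old_t, size_t n)
        {
          double log_sum = 0.0;
          for (size_t i = 0; i < n; i++)
            log_sum += log (new_t[i] / old_t[i]);
          return exp (log_sum / n);
        }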
    
    Full results attached in email.
    
    Full check passes on x86-64.

Diff:
---
 sysdeps/x86_64/multiarch/memrchr-evex.S | 538 +++++++++++++++++++-------------
 1 file changed, 324 insertions(+), 214 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 550b328c5a..dbcf52808f 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -21,17 +21,19 @@
 #if ISA_SHOULD_BUILD (4)
 
 # include <sysdep.h>
-# include "x86-evex256-vecs.h"
-# if VEC_SIZE != 32
-#  error "VEC_SIZE != 32 unimplemented"
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
 # endif
 
+# include "reg-macros.h"
+
 # ifndef MEMRCHR
-#  define MEMRCHR				__memrchr_evex
+#  define MEMRCHR	__memrchr_evex
 # endif
 
-# define PAGE_SIZE			4096
-# define VMMMATCH			VMM(0)
+# define PAGE_SIZE	4096
+# define VMATCH	VMM(0)
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN(MEMRCHR, 6)
@@ -43,294 +45,402 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 # endif
 	jz	L(zero_0)
 
-	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
-	   correct page cross check and 2) it correctly sets up end ptr to be
-	   subtract by lzcnt aligned.  */
+	/* Get end pointer.  Minus one for three reasons.  1) It is
+	   necessary for a correct page cross check, 2) it sets up the
+	   end ptr so that subtracting lzcnt from it yields the match
+	   position, and 3) it is a necessary step in aligning ptr.  */
 	leaq	-1(%rdi, %rdx), %rax
-	vpbroadcastb %esi, %VMMMATCH
+	vpbroadcastb %esi, %VMATCH
 
 	/* Check if we can load 1x VEC without cross a page.  */
 	testl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jz	L(page_cross)
 
-	/* Don't use rax for pointer here because EVEX has better encoding with
-	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-
-	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
-	cmpq	$VEC_SIZE, %rdx
-	ja	L(more_1x_vec)
-L(ret_vec_x0_test):
-
-	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
-	   will guarantee edx (len) is less than it.  */
-	lzcntl	%ecx, %ecx
-	cmpl	%ecx, %edx
-	jle	L(zero_0)
-	subq	%rcx, %rax
+	/* Don't use rax for pointer here because EVEX has better
+	   encoding with offset % VEC_SIZE == 0.  */
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	/* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is
+	   already a dependency between rcx and rsi so no worries about
+	   false-dep here.  */
+	lzcnt	%VRCX, %VRSI
+	/* If rdx <= rsi then either 1) rcx was non-zero (there was a
+	   match) but it was out of bounds or 2) rcx was zero and rdx
+	   was <= VEC_SIZE so we are done scanning.  */
+	cmpq	%rsi, %rdx
+	/* NB: Use branch to return zero/non-zero.  Common usage will
+	   branch on result of function (if return is null/non-null).
+	   This branch can be used to predict the ensuing one so there
+	   is no reason to extend the data-dependency with cmovcc.  */
+	jbe	L(zero_0)
+
+	/* If rcx is zero then len must be > VEC_SIZE, otherwise since
+	   we already tested len vs lzcnt(rcx) (in rsi) we are good to
+	   return this match.  */
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
+	subq	%rsi, %rax
 	ret
 
-	/* Fits in aligning bytes of first cache line.  */
+	/* Fits in aligning bytes of first cache line for VEC_SIZE ==
+	   32.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 2
 L(zero_0):
 	xorl	%eax, %eax
 	ret
-
-	.p2align 4,, 9
-L(ret_vec_x0_dec):
-	decq	%rax
-L(ret_vec_x0):
-	lzcntl	%ecx, %ecx
-	subq	%rcx, %rax
-	ret
+# endif
 
 	.p2align 4,, 10
 L(more_1x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0)
-
 	/* Align rax (pointer to string).  */
 	andq	$-VEC_SIZE, %rax
-
+L(page_cross_continue):
 	/* Recompute length after aligning.  */
-	movq	%rax, %rdx
+	subq	%rdi, %rax
 
-	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-
-	subq	%rdi, %rdx
-
-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmpq	$(VEC_SIZE * 2), %rax
 	ja	L(more_2x_vec)
+
 L(last_2x_vec):
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
-	decq	%rax
-	cmpl	$VEC_SIZE, %edx
-	jbe	L(ret_vec_x0_test)
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x0_test)
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0)
+	/* If VEC_SIZE == 64 need to subtract because lzcntq won't
+	   implicitly add VEC_SIZE to match position.  */
+# if VEC_SIZE == 64
+	subl	$VEC_SIZE, %eax
+# else
+	cmpb	$VEC_SIZE, %al
+# endif
+	jle	L(zero_2)
 
-	/* Don't use rax for pointer here because EVEX has better encoding with
-	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
+	/* We adjusted rax (length) for VEC_SIZE == 64 so need separate
+	   offsets.  */
+# if VEC_SIZE == 64
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+# else
+	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
+# endif
+	KMOV	%k0, %VRCX
+	/* NB: 64-bit lzcnt. This will naturally add 32 to position for
+	   VEC_SIZE == 32.  */
 	lzcntq	%rcx, %rcx
-	cmpl	%ecx, %edx
-	jle	L(zero_0)
-	subq	%rcx, %rax
-	ret
-
-	/* Inexpensive place to put this regarding code size / target alignments
-	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
-	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
-	   in first cache line.  */
-L(page_cross):
-	movq	%rax, %rsi
-	andq	$-VEC_SIZE, %rsi
-	vpcmpb	$0, (%rsi), %VMMMATCH, %k0
-	kmovd	%k0, %r8d
-	/* Shift out negative alignment (because we are starting from endptr and
-	   working backwards).  */
-	movl	%eax, %ecx
-	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
-	notl	%ecx
-	shlxl	%ecx, %r8d, %ecx
-	cmpq	%rdi, %rsi
-	ja	L(more_1x_vec)
-	lzcntl	%ecx, %ecx
-	cmpl	%ecx, %edx
-	jle	L(zero_1)
-	subq	%rcx, %rax
+	subl	%ecx, %eax
+	ja	L(first_vec_x1_ret)
+	/* If VEC_SIZE == 64 put L(zero_0) here as we can't fit it in
+	   the first cache line (this is the second cache line).  */
+# if VEC_SIZE == 64
+L(zero_0):
+# endif
+L(zero_2):
+	xorl	%eax, %eax
 	ret
 
-	/* Continue creating zero labels that fit in aligning bytes and get
-	   2-byte encoding / are in the same cache line as condition.  */
-L(zero_1):
-	xorl	%eax, %eax
+	/* NB: Fits in aligning bytes before next cache line for
+	   VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
+	   L(ret_vec_x0_test).  */
+# if VEC_SIZE == 32
+L(first_vec_x1_ret):
+	leaq	-1(%rdi, %rax), %rax
 	ret
+# endif
 
-	.p2align 4,, 8
-L(ret_vec_x1):
-	/* This will naturally add 32 to position.  */
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
+	.p2align 4,, 6
+L(ret_vec_x0_test):
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	jle	L(zero_2)
+# if VEC_SIZE == 64
+	/* Reuse code at the end of L(ret_vec_x0_test) as we can't fit
+	   L(first_vec_x1_ret) in the same cache line as its jmp base
+	   so we might as well save code size.  */
+L(first_vec_x1_ret):
+# endif
+	leaq	-1(%rdi, %rax), %rax
 	ret
 
-	.p2align 4,, 8
+	.p2align 4,, 6
+L(loop_last_4x_vec):
+	/* Compute remaining length.  */
+	subl	%edi, %eax
+L(last_4x_vec):
+	cmpl	$(VEC_SIZE * 2), %eax
+	jle	L(last_2x_vec)
+# if VEC_SIZE == 32
+	/* Only align for VEC_SIZE == 32.  For VEC_SIZE == 64 we need
+	   the spare bytes to align the loop properly.  */
+	.p2align 4,, 10
+# endif
 L(more_2x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_dec)
 
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1)
+	/* Length > VEC_SIZE * 2 so check the first 2x VEC for a match
+	   and return if either hits.  */
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x0)
+
+	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x1)
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	subq	$(VEC_SIZE * 4), %rdx
+	/* Check if we are near the end.  */
+	subq	$(VEC_SIZE * 4), %rax
 	ja	L(more_4x_vec)
 
-	cmpl	$(VEC_SIZE * -1), %edx
-	jle	L(ret_vec_x2_test)
-L(last_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x2)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2_test)
 
+	/* Adjust length for final check and check if we are at the end.
+	 */
+	addl	$(VEC_SIZE * 1), %eax
+	jle	L(zero_1)
 
-	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 3 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	ja	L(zero_1)
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	ja	L(first_vec_x3_ret)
+L(zero_1):
+	xorl	%eax, %eax
+	ret
+L(first_vec_x3_ret):
+	leaq	-1(%rdi, %rax), %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x2_test):
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 2 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	ja	L(zero_1)
+	.p2align 4,, 6
+L(first_vec_x2_test):
+	/* Must adjust length before check.  */
+	subl	$-(VEC_SIZE * 2 - 1), %eax
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	jl	L(zero_4)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x2):
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+
+	.p2align 4,, 10
+L(first_vec_x0):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * -1)(%rdi, %rax), %rax
+	addq	%rcx, %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x3):
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	/* Fits unobtrusively here.  */
+L(zero_4):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * -2)(%rdi, %rax), %rax
+	addq	%rcx, %rax
 	ret
 
 	.p2align 4,, 8
+L(first_vec_x3):
+	bsr	%VRCX, %VRCX
+	addq	%rdi, %rax
+	addq	%rcx, %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x2):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 1)(%rdi, %rax), %rax
+	addq	%rcx, %rax
+	ret
+
+	.p2align 4,, 2
 L(more_4x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x2)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	vpcmpeqb (%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x3)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3)
 
 	/* Check if near end before re-aligning (otherwise might do an
 	   unnecessary loop iteration).  */
-	addq	$-(VEC_SIZE * 4), %rax
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rax
 	jbe	L(last_4x_vec)
 
-	decq	%rax
-	andq	$-(VEC_SIZE * 4), %rax
-	movq	%rdi, %rdx
-	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
-	   lengths that overflow can be valid and break the comparison.  */
-	andq	$-(VEC_SIZE * 4), %rdx
+
+	/* NB: We set up the loop to NOT use index-address-mode for the
+	   buffer.  This costs some instructions & code size but avoids
+	   stalls from unlaminated micro-fused instructions (as used in
+	   the loop) being forced to issue in the same group
+	   (essentially narrowing the backend width).  */
+
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi
+	   because lengths that overflow can be valid and break the
+	   comparison.  */
+# if VEC_SIZE == 64
+	/* Use rdx as an intermediate to compute rax; this gets us imm8
+	   encoding, which just allows the L(more_4x_vec) block to fit
+	   in 1 cache-line.  */
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	leaq	(VEC_SIZE * -1)(%rdx, %rax), %rax
+
+	/* No evex machine has partial register stalls. This can be
+	   replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that
+	   changes.  */
+	xorb	%al, %al
+	xorb	%dl, %dl
+# else
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	andq	$(VEC_SIZE * -4), %rax
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$(VEC_SIZE * -4), %rdx
+# endif
+
 
 	.p2align 4
 L(loop_4x_vec):
-	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
-	   on).  */
-	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
+	/* NB: We could do the same optimization here as we do for
+	   memchr/rawmemchr by using VEX encoding in the loop for access
+	   to VEX vpcmpeqb + vpternlogd.  Since memrchr is not as hot as
+	   memchr it may not be worth the extra code size, but if the
+	   need arises it is an easy ~15% perf improvement to the loop.  */
+
+	cmpq	%rdx, %rax
+	je	L(loop_last_4x_vec)
+	/* Store 1 where not-equal and 0 where equal in k1 (used to
+	   mask later on).  */
+	vpcmpb	$4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1
 
 	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
-	vpxorq	(VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
-	vpxorq	(VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
-	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
+	vpxorq	(VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
+	vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4
 
-	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
-	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit
+	   where CHAR is found and VEC(2/3) have zero-byte where CHAR
+	   is found).  */
 	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
 	vptestnmb %VMM(3), %VMM(3), %k2
 
-	/* Any 1s and we found CHAR.  */
-	kortestd %k2, %k4
-	jnz	L(loop_end)
-
 	addq	$-(VEC_SIZE * 4), %rax
-	cmpq	%rdx, %rax
-	jne	L(loop_4x_vec)
 
-	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
-	subq	$-(VEC_SIZE * 4), %rdx
-	movq	%rdx, %rax
-	subl	%edi, %edx
-L(last_4x_vec):
+	/* Any 1s and we found CHAR.  */
+	KORTEST %k2, %k4
+	jz	L(loop_4x_vec)
+
 
-	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	/* K1 has non-matches for first VEC. inc; jz will overflow rcx
+	   iff all bytes were non-matches.  */
+	KMOV	%k1, %VRCX
+	inc	%VRCX
+	jnz	L(first_vec_x0_end)
 
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	vptestnmb %VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x1_end)
+	KMOV	%k2, %VRCX
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
+	   individually, for VEC_SIZE == 32 we combine them in a single
+	   64-bit GPR.  */
+# if VEC_SIZE == 64
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2_end)
+	KMOV	%k4, %VRCX
+# else
+	/* Combine last 2 VEC matches for VEC_SIZE == 32. If rcx (from
+	   VEC(3)) is zero (no CHAR in VEC(3)) then it won't affect the
+	   result in rsi (from VEC(4)). If rcx is non-zero then CHAR in
+	   VEC(3) and bsrq will use that position.  */
+	KMOV	%k4, %VRSI
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+# endif
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_dec)
+	.p2align 4,, 4
+L(first_vec_x0_end):
+	/* rcx has 1s at non-matches so we need to `not` it.  We used
+	   `inc` to test if zero so use `neg` to complete the `not` so
+	   the last 1 bit represents a match.  NB: (-(x + 1) == ~x).  */
+	neg	%VRCX
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
+	.p2align 4,, 10
+L(first_vec_x1_end):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+	ret
 
-	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+# if VEC_SIZE == 64
+	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64 we
+	   need a return label for it.  */
+	.p2align 4,, 4
+L(first_vec_x2_end):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 1)(%rcx, %rax), %rax
+	ret
+# endif
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1)
 
-	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	.p2align 4,, 4
+L(page_cross):
+	/* Only the lower bits of eax[log2(VEC_SIZE):0] are set so we
+	   can use movzbl to get the number of bytes we are checking
+	   here.  */
+	movzbl	%al, %ecx
+	andq	$-VEC_SIZE, %rax
+	vpcmpeqb (%rax), %VMATCH, %k0
+	KMOV	%k0, %VRSI
 
-	cmpl	$(VEC_SIZE * 3), %edx
-	ja	L(last_vec)
+	/* eax was computed as %rdi + %rdx - 1 so we need to add back 1
+	   here.  */
+	leal	1(%rcx), %r8d
 
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 2 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	jbe	L(ret_1)
+	/* Invert ecx to get shift count for byte matches out of range.
+	 */
+	notl	%ecx
+	shlx	%VRCX, %VRSI, %VRSI
+
+	/* If rdx <= r8 then the entire [buf, buf + len] is handled in
+	   the page cross case.  NB: we can't use the trick we use in
+	   the non page-cross case here because we aren't checking a
+	   full VEC_SIZE.  */
+	cmpq	%r8, %rdx
+	ja	L(page_cross_check)
+	lzcnt	%VRSI, %VRSI
+	subl	%esi, %edx
+	ja	L(page_cross_ret)
 	xorl	%eax, %eax
-L(ret_1):
 	ret
 
-	.p2align 4,, 6
-L(loop_end):
-	kmovd	%k1, %ecx
-	notl	%ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_end)
+L(page_cross_check):
+	test	%VRSI, %VRSI
+	jz	L(page_cross_continue)
 
-	vptestnmb %VMM(2), %VMM(2), %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1_end)
-
-	kmovd	%k2, %ecx
-	kmovd	%k4, %esi
-	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
-	   then it won't affect the result in esi (VEC4). If ecx is non-zero
-	   then CHAR in VEC3 and bsrq will use that position.  */
-	salq	$32, %rcx
-	orq	%rsi, %rcx
-	bsrq	%rcx, %rcx
-	addq	%rcx, %rax
-	ret
-	.p2align 4,, 4
-L(ret_vec_x0_end):
-	addq	$(VEC_SIZE), %rax
-L(ret_vec_x1_end):
-	bsrl	%ecx, %ecx
-	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
+	lzcnt	%VRSI, %VRSI
+	subl	%esi, %edx
+L(page_cross_ret):
+	leaq	-1(%rdi, %rdx), %rax
 	ret
-
 END(MEMRCHR)
 #endif
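
The loop tail above packs the match masks of the last two 32-byte vectors
(VEC_SIZE == 32) into one 64-bit value so a single bsrq locates the
right-most match.  A minimal C sketch of that idea (not a transcription of
the assembly; the helper name and mask layout are illustrative assumptions):

	#include <stdint.h>
	#include <stddef.h>

	/* base points at the lower of two adjacent 32-byte vectors; mask_lo
	   and mask_hi have one bit per byte of the lower/upper vector.  */
	static const char *
	last_match_2x32 (const char *base, uint32_t mask_lo, uint32_t mask_hi)
	{
	  uint64_t combined = ((uint64_t) mask_hi << 32) | mask_lo;
	  if (combined == 0)
	    return NULL;		/* No match in either vector.  */
	  /* 63 - clz is the index of the highest set bit, i.e. bsr.  */
	  return base + (63 - __builtin_clzll (combined));
	}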

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [glibc] x86: Optimize memrchr-evex.S
@ 2022-06-07 20:44 Noah Goldstein
  0 siblings, 0 replies; 2+ messages in thread
From: Noah Goldstein @ 2022-06-07 20:44 UTC (permalink / raw)
  To: glibc-cvs

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=b4209615a06b01c974f47b4998b00e4c7b1aa5d9

commit b4209615a06b01c974f47b4998b00e4c7b1aa5d9
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:31 2022 -0700

    x86: Optimize memrchr-evex.S
    
    The new code:
        1. prioritizes smaller user-arg lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic. The biggest
           case here is the `lzcnt` logic for checking returns, which
           saves either a branch or multiple instructions (a small
           sketch follows this list).
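
    A hedged C sketch of that lzcnt return-check shape (illustration only;
    names are invented and this is not the library code): the older pattern
    needs a zero test plus a separate bounds check after bsr, while lzcnt
    folds "no match" into the same length compare because lzcnt of a zero
    mask is the operand width.

        /* Compile with LZCNT support (e.g. gcc -mlzcnt).  */
        #include <immintrin.h>
        #include <stddef.h>

        /* Older shape: an extra branch on an empty mask.  */
        static const char *
        check_branchy (unsigned int mask, size_t len, const char *end)
        {
          if (mask == 0)
            return NULL;
          unsigned int dist = __builtin_clz (mask);  /* mask != 0 here.  */
          return dist < len ? end - dist : NULL;
        }

        /* lzcnt shape: one compare covers "no match" and "out of bounds"
           for len <= 32, since _lzcnt_u32 (0) == 32.  */
        static const char *
        check_lzcnt (unsigned int mask, size_t len, const char *end)
        {
          unsigned int dist = _lzcnt_u32 (mask);
          return len > dist ? end - dist : NULL;
        }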
    
    The total code size saving is: 263 bytes
    Geometric Mean of all benchmarks New / Old: 0.755
    
    Regressions:
    There are some regressions, particularly where the length (user-arg
    length) is large but the match char sits near the beginning of the
    string (in the first VEC). This case has roughly a 20% regression.
    
    This is because the new logic gives the hot path for immediate matches
    to shorter lengths (the more common input); that shorter-length case
    has roughly a 35% speedup.
    
    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Diff:
---
 sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++++++----------------
 1 file changed, 268 insertions(+), 271 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 0b99709c6b..91329b18dc 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -19,319 +19,316 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
+# ifndef MEMRCHR
+#  define MEMRCHR				__memrchr_evex
+# endif
+
+# define PAGE_SIZE			4096
+# define VECMATCH			VEC(0)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
+
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up end ptr to be
+	   subtract by lzcnt aligned.  */
+	leaq	-1(%rdi, %rdx), %rax
+	vpbroadcastb %esi, %VECMATCH
+
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+
+	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+
+	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
+	   will guarantee edx (len) is less than it.  */
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-# define VMOVA		vmovdqa64
-
-# define YMMMATCH	ymm16
-
-# define VEC_SIZE 32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (__memrchr_evex)
-	/* Broadcast CHAR to YMMMATCH.  */
-	vpbroadcastb %esi, %YMMMATCH
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
-
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	kmovd	%k1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0_dec):
+	decq	%rax
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
+	/* Align rax (pointer to string).  */
+	andq	$-VEC_SIZE, %rax
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
+	/* Recompute length after aligning.  */
+	movq	%rax, %rdx
 
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	ret
+	subq	%rdi, %rdx
 
-	.p2align 4
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
+
+	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
+	decq	%rax
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	kmovd	%k0, %r8d
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %ecx
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%ecx
+	shlxl	%ecx, %r8d, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_1)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
+	/* Continue creating zero labels that fit in aligning bytes and get
+	   2-byte encoding / are in the same cache line as condition.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
-
-	kmovd	%k1, %eax
-
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
-
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(zero)
-
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
-
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	/* Check the last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	ret
+	decq	%rax
+	andq	$-(VEC_SIZE * 4), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	andq	$-(VEC_SIZE * 4), %rdx
 
 	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
-
-	/* Check the last VEC.  */
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+L(loop_4x_vec):
+	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
+	   on).  */
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+
+	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
+	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
+	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
+	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
+	vptestnmb %VEC(3), %VEC(3), %k2
+
+	/* Any 1s and we found CHAR.  */
+	kortestd %k2, %k4
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
+
+	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
+	subq	$-(VEC_SIZE * 4), %rdx
+	movq	%rdx, %rax
+	subl	%edi, %edx
+L(last_4x_vec):
+
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	kmovd	%k1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	movl	%r8d, %ecx
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	kmovd	%k1, %eax
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret_1)
+	xorl	%eax, %eax
+L(ret_1):
+	ret
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 6
+L(loop_end):
+	kmovd	%k1, %ecx
+	notl	%ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vptestnmb %VEC(2), %VEC(2), %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	kmovd	%k2, %ecx
+	kmovd	%k4, %esi
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	addq	$(VEC_SIZE), %rax
+L(ret_vec_x1_end):
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
 	ret
-END (__memrchr_evex)
+
+END(MEMRCHR)
 #endif
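
The page-cross path above shifts the match mask left by the negated low
address bits so that bits for bytes past the end of the buffer fall out of
the register before lzcnt is applied.  A minimal C sketch of that masking
step (illustrative only; the helper name is an assumption):

	#include <stdint.h>

	/* mask has one bit per byte of the aligned 32-byte vector containing
	   the buffer's last byte; end_addr is that last byte's address.
	   Shifting left by (~end_addr & 31) == 31 - (end_addr % 32) discards
	   the bits for bytes past the end, so lzcnt of the result is the
	   distance of the last in-range match from the end.  */
	static unsigned int
	shift_out_past_end (unsigned int mask, uintptr_t end_addr)
	{
	  unsigned int shift = (unsigned int) (~end_addr & 31);
	  return mask << shift;
	}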


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2022-10-20  1:45 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-20  1:45 [glibc] x86: Optimize memrchr-evex.S Noah Goldstein
  -- strict thread matches above, loose matches on Subject: below --
2022-06-07 20:44 Noah Goldstein
