From: Noah Goldstein <goldstein.w.n@gmail.com>
To: libc-alpha@sourceware.org
Subject: [PATCH v2 6/8] x86: Optimize memrchr-avx2.S
Date: Fri,  3 Jun 2022 15:04:27 -0500
Message-ID: <20220603200429.379547-6-goldstein.w.n@gmail.com>
In-Reply-To: <20220603200429.379547-1-goldstein.w.n@gmail.com>

The new code:
    1. Prioritizes smaller user-arg lengths more.
    2. Optimizes target placement more carefully.
    3. Reuses logic more.
    4. Fixes up various inefficiencies in the logic. The biggest case
       here is the `lzcnt` logic for checking returns, which saves
       either a branch or multiple instructions (see the sketch below).
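
As a rough illustration (not part of the patch; the names below are made
up), the `lzcnt` return logic works because the end pointer is set up to
point at the last byte compared, so the match address is a single
subtraction and the no-match case falls out of the same length compare:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the lzcnt return logic.  END points at the last byte
       compared and MASK is the vpmovmskb result for that VEC, so bit 31
       of MASK is the byte at END and the leading-zero count is the
       distance from END back to the last match.  The real code uses the
       lzcnt instruction, which returns 32 (VEC_SIZE) for a zero mask, so
       the same length compare also rejects the no-match case.  */
    static inline char *
    ret_from_mask (char *end, uint32_t mask, size_t len)
    {
      unsigned int lz = mask ? (unsigned int) __builtin_clz (mask) : 32;
      if (len <= lz)
        return NULL;	/* Match, if any, is before the buffer start.  */
      return end - lz;
    }

This is what lets the return paths drop the separate zero test the old
bsrl-based code needed.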

The total code size saving is 306 bytes.
Geometric Mean of all benchmarks New / Old: 0.760

Regressions:
There are some regressions, particularly where the length (user-arg
length) is large but the match character is near the beginning of the
string (in the first VEC). This case has roughly a 10-20% regression.

This is because the new logic gives the hot path for immediate matches
to shorter lengths (the more common input). That shorter-length case
has roughly a 15-45% speedup.

Full xcheck passes on x86_64.
---
 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S |   1 +
 sysdeps/x86_64/multiarch/memrchr-avx2.S     | 538 ++++++++++----------
 2 files changed, 260 insertions(+), 279 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
index cea2d2a72d..5e9beeeef2 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -2,6 +2,7 @@
 # define MEMRCHR __memrchr_avx2_rtm
 #endif
 
+#define COND_VZEROUPPER	COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ba2ce7cb03..6915e1c373 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -21,340 +21,320 @@
 # include <sysdep.h>
 
 # ifndef MEMRCHR
-#  define MEMRCHR	__memrchr_avx2
+#  define MEMRCHR				__memrchr_avx2
 # endif
 
 # ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
+#  define VZEROUPPER			vzeroupper
 # endif
 
+// abf-off
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
+// abf-on
+
+# define VEC_SIZE			32
+# define PAGE_SIZE			4096
+	.section SECTION(.text), "ax", @progbits
+ENTRY(MEMRCHR)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
 
-# define VEC_SIZE 32
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMRCHR)
-	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
-	vpbroadcastb %xmm0, %ymm0
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
+	/* Get end pointer.  Minus one for two reasons: 1) it is necessary
+	   for a correct page cross check and 2) it sets up the end ptr so
+	   that the match position can be computed by subtracting lzcnt.  */
+	leaq	-1(%rdx, %rdi), %rax
 
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
+	vpbroadcastb %xmm0, %ymm0
 
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+
+L(ret_vec_x0_test):
+	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
+	   which guarantees edx (len) is less than it.  */
+	lzcntl	%ecx, %ecx
+
+	/* Hoist vzeroupper (not great for RTM) to save code size. This allows
+	   all logic for edx (len) <= VEC_SIZE to fit in first cache line.  */
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vmovdqa	(%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-
-	vpcmpeqb %ymm1, %ymm0, %ymm1
-	vpcmpeqb %ymm2, %ymm0, %ymm2
-	vpcmpeqb %ymm3, %ymm0, %ymm3
-	vpcmpeqb %ymm4, %ymm0, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpmovmskb %ymm1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Align rax (string pointer).  */
+	andq	$-VEC_SIZE, %rax
+
+	/* Recompute remaining length after aligning.  */
+	movq	%rax, %rdx
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
+	subq	%rdi, %rdx
+	decq	%rax
+	vpmovmskb %ymm1, %ecx
+	/* Fall through for short (hotter than length).  */
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target
+	   alignments / ICache NLP.  Necessary for the 2-byte encoding of the
+	   jump to the page cross case, which in turn is necessary for the hot
+	   path (len <= VEC_SIZE) to fit in the first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpeqb (%rsi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %r8d
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%r8d
+	shlxl	%r8d, %ecx, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
+	.p2align 4,, 11
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
+	.p2align 4,, 10
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	.p2align 4
-L(null):
+	/* First in aligning bytes.  */
+L(zero_2):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
+	.p2align 4,, 4
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	vpcmpeqb (%rdi), %ymm0, %ymm1
 
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
+	.p2align 4,, 11
+L(ret_vec_x2):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4,, 14
+L(ret_vec_x3):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(null)
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	/* Align rax to (VEC_SIZE - 1).  */
+	orq	$(VEC_SIZE * 4 - 1), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	orq	$(VEC_SIZE * 4 - 1), %rdx
 
-	/* Check the last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	.p2align 4
+L(loop_4x_vec):
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	vpor	%ymm1, %ymm2, %ymm2
+	vpor	%ymm3, %ymm4, %ymm4
+	vpor	%ymm2, %ymm4, %ymm4
+	vpmovmskb %ymm4, %esi
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	testl	%esi, %esi
+	jnz	L(loop_end)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	VZEROUPPER_RETURN
+	addq	$(VEC_SIZE * -4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
 
-	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
+	subl	%edi, %edx
+	incl	%edx
 
-	/* Check the last VEC.  */
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	vpmovmskb %ymm1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret0)
+	xorl	%eax, %eax
+L(ret0):
+	ret
 
-	movl	%r8d, %ecx
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(loop_end):
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vpmovmskb %ymm2, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	vpmovmskb %ymm3, %ecx
+	/* Combine last 2 VEC matches.  If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4).  If ecx is non-zero
+	   then CHAR is in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	.p2align 4,, 4
+L(ret_vec_x1_end):
+	/* 64-bit version will automatically add 32 (VEC_SIZE).  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
-END (MEMRCHR)
+
+	/* 2 bytes until next cache line.  */
+END(MEMRCHR)
 #endif
-- 
2.34.1


Thread overview: 82+ messages
2022-06-03  4:42 [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein
2022-06-03  4:42 ` [PATCH v1 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein
2022-06-03 20:04   ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein
2022-06-03 20:04     ` [PATCH v2 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein
2022-06-03 23:12       ` H.J. Lu
2022-06-03 23:33         ` Noah Goldstein
2022-06-03 20:04     ` [PATCH v2 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein
2022-06-03 20:04     ` [PATCH v2 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein
2022-06-03 20:04     ` [PATCH v2 5/8] x86: Optimize memrchr-evex.S Noah Goldstein
2022-06-03 20:04     ` Noah Goldstein [this message]
2022-06-03 20:04     ` [PATCH v2 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein
2022-06-03 20:04     ` [PATCH v2 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein
2022-06-03 23:09     ` [PATCH v2 1/8] x86: Create header for VEC classes in x86 strings library H.J. Lu
2022-06-03 23:49       ` Noah Goldstein
2022-06-03 23:49   ` [PATCH v3 " Noah Goldstein
2022-06-03 23:49     ` [PATCH v3 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein
2022-06-06 21:30       ` H.J. Lu
2022-06-06 22:38         ` Noah Goldstein
2022-06-03 23:49     ` [PATCH v3 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein
2022-06-03 23:49     ` [PATCH v3 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein
2022-06-03 23:49     ` [PATCH v3 5/8] x86: Optimize memrchr-evex.S Noah Goldstein
2022-06-03 23:49     ` [PATCH v3 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein
2022-06-03 23:50     ` [PATCH v3 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein
2022-06-03 23:50     ` [PATCH v3 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein
2022-06-06 22:37   ` [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein
2022-06-06 22:37     ` [PATCH v4 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein
2022-06-07  2:45       ` H.J. Lu
2022-07-14  2:12         ` Sunil Pandey
2022-06-06 22:37     ` [PATCH v4 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein
2022-06-07  2:44       ` H.J. Lu
2022-06-07  4:10         ` Noah Goldstein
2022-06-06 22:37     ` [PATCH v4 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein
2022-06-06 22:37     ` [PATCH v4 5/8] x86: Optimize memrchr-evex.S Noah Goldstein
2022-06-07  2:41       ` H.J. Lu
2022-06-07  4:09         ` Noah Goldstein
2022-06-07  4:12           ` Noah Goldstein
2022-06-06 22:37     ` [PATCH v4 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein
2022-06-07  2:35       ` H.J. Lu
2022-06-07  4:06         ` Noah Goldstein
2022-06-06 22:37     ` [PATCH v4 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein
2022-06-06 22:37     ` [PATCH v4 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein
2022-06-07  4:05   ` [PATCH v5 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein
2022-06-07  4:05     ` [PATCH v5 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein
2022-06-07  4:05     ` [PATCH v5 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein
2022-06-07  4:05     ` [PATCH v5 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein
2022-06-07  4:05     ` [PATCH v5 5/8] x86: Optimize memrchr-evex.S Noah Goldstein
2022-06-07  4:05     ` [PATCH v5 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein
2022-06-07  4:05     ` [PATCH v5 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein
2022-06-07  4:05     ` [PATCH v5 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein
2022-06-07  4:11   ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein
2022-06-07  4:11     ` [PATCH v6 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Noah Goldstein
2022-06-07  4:11     ` [PATCH v6 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein
2022-06-07 18:03       ` H.J. Lu
2022-06-07  4:11     ` [PATCH v6 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein
2022-06-07 18:04       ` H.J. Lu
2022-07-14  2:19         ` Sunil Pandey
2022-06-07  4:11     ` [PATCH v6 5/8] x86: Optimize memrchr-evex.S Noah Goldstein
2022-06-07 18:21       ` H.J. Lu
2022-07-14  2:21         ` Sunil Pandey
2022-06-07  4:11     ` [PATCH v6 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein
2022-06-07 18:17       ` H.J. Lu
2022-07-14  2:26         ` Sunil Pandey
2022-07-14  2:43           ` Noah Goldstein
2022-06-07  4:11     ` [PATCH v6 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein
2022-06-07 18:18       ` H.J. Lu
2022-07-14  2:31         ` Sunil Pandey
2022-07-14  2:41           ` Noah Goldstein
2022-06-07  4:11     ` [PATCH v6 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein
2022-06-07 18:19       ` H.J. Lu
2022-07-14  2:32         ` Sunil Pandey
2022-06-07 18:04     ` [PATCH v6 1/8] x86: Create header for VEC classes in x86 strings library H.J. Lu
2022-07-14  2:07       ` Sunil Pandey
2022-06-03  4:42 ` [PATCH v1 3/8] Benchtests: Improve memrchr benchmarks Noah Goldstein
2022-06-03  4:42 ` [PATCH v1 4/8] x86: Optimize memrchr-sse2.S Noah Goldstein
2022-06-03  4:47   ` Noah Goldstein
2022-06-03  4:42 ` [PATCH v1 5/8] x86: Optimize memrchr-evex.S Noah Goldstein
2022-06-03  4:49   ` Noah Goldstein
2022-06-03  4:42 ` [PATCH v1 6/8] x86: Optimize memrchr-avx2.S Noah Goldstein
2022-06-03  4:50   ` Noah Goldstein
2022-06-03  4:42 ` [PATCH v1 7/8] x86: Shrink code size of memchr-avx2.S Noah Goldstein
2022-06-03  4:42 ` [PATCH v1 8/8] x86: Shrink code size of memchr-evex.S Noah Goldstein
2022-06-03  4:51 ` [PATCH v1 1/8] x86: Create header for VEC classes in x86 strings library Noah Goldstein
